# Libraries to help with reading and manipulating data
from pprint import pprint
import numpy as np
import pandas as pd
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from IPython.core.display import display
# Libraries to tune model, get different metric scores, and split data
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,recall_score, plot_confusion_matrix
from sklearn import metrics
# Library to impute missing values
from sklearn.impute import KNNImputer
#libraries to help with model building
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
BaggingClassifier,
GradientBoostingClassifier,
RandomForestClassifier)
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
pd.options.display.max_columns = None #forcing jupyter to display all columns
interactive_charts = True #flag to turn on or off the plotly.express charts used throughout the notebook
run_grid_search = True #flag to turn on or off the (slow) GridSearchCV/RandomizedSearchCV runs
random_state = 314159 #single random state reused by every call that accepts one, for reproducibility
#Function to make a summary df with column types, etc.
def make_summary_cols(df, cat_threshold=20):
    '''Create a df summarising the columns of df: dtype, unique count, null count,
    and numeric / categorical flags.
    df: dataframe to summarise
    cat_threshold: number of unique values at or below which a column is assumed categorical
    returns:
    summary_cols: summary dataframe, one row per column of df, sorted by dtype
    d: dict with keys 'numeric_cols', 'categorical_cols' and 'non_numeric_cols' whose
       values are lists of the corresponding column names
    '''
    types = df.dtypes
    types.name = 'col_types'
    # BUG FIX: was data.nunique() — referenced the module-level global instead of the df argument
    nuniques = df.nunique()
    nuniques.name = 'n_uniques'
    nulls = df.isnull().sum()
    nulls.name = 'nulls'
    summary_cols = pd.merge(left=pd.merge(left=nuniques, right=types, left_index=True, right_index=True),
                            right=nulls, left_index=True, right_index=True).sort_values(by='col_types')
    # object dtype -> non-numeric; every other dtype is treated as numeric
    summary_cols['isnumeric_column'] = summary_cols['col_types'].apply(lambda x: x != 'object')
    # low-cardinality columns are assumed categorical
    summary_cols['probably_categorical'] = summary_cols['n_uniques'] <= cat_threshold
    d = {
        'numeric_cols': list(summary_cols[summary_cols.isnumeric_column==True].index),
        'categorical_cols': list(summary_cols[summary_cols.probably_categorical==True].index),
        'non_numeric_cols': list(summary_cols[summary_cols.isnumeric_column==False].index)
    }
    return summary_cols, d
#Function to plot histogram and boxplot together
def histogram_boxplot(feature, figsize=(15, 10), bins=None):
    """Boxplot and histogram of a feature stacked on a shared x-axis.
    feature: 1-d feature array
    figsize: size of fig (default (15, 10))  # BUG FIX: docstring previously said (9,8)
    bins: number of bins (default None / auto, which also adds a KDE overlay)
    """
    # two rows sharing the x axis: slim boxplot on top (25%), histogram below (75%)
    f2, (ax_box2, ax_hist2) = plt.subplots(nrows=2,
                                           sharex=True,
                                           gridspec_kw={"height_ratios": (.25, .75)},
                                           figsize=figsize)
    # boxplot with a marker (star) indicating the mean of the column
    sns.boxplot(x=feature, ax=ax_box2, showmeans=True, color='violet')
    # an explicit bin count disables the KDE overlay; auto-binning keeps it
    if bins:
        sns.histplot(feature, kde=False, ax=ax_hist2, bins=bins)
    else:
        sns.histplot(feature, kde=True, ax=ax_hist2)
    ax_hist2.axvline(np.mean(feature), color='red', linestyle='--')  # mean marker
    ax_hist2.axvline(np.median(feature), color='blue', linestyle='-')  # median marker
#Function to plot bar chart of unique values of feature with % freq. of appearance as labels
def perc_on_bar(z, df=None):
    '''
    Bar plot of the value counts of a categorical column, each bar annotated with
    its percentage of all rows.
    z: name of the categorical column
    df: dataframe to use; defaults to the notebook-level `data` for backward compatibility
    the function won't work if a column is passed in hue parameter
    '''
    if df is None:
        df = data  # generalization: previously this function could only use the global df
    total = len(df[z])  # number of rows, denominator for the percentages
    plt.figure(figsize=(15, 5))
    ax = sns.countplot(x=df[z], palette='Paired')
    for p in ax.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height() / total)  # share of rows in this category
        x = p.get_x() + p.get_width() / 2 - 0.05  # label x position, roughly centred on the bar
        y = p.get_y() + p.get_height()  # label y position, top of the bar
        ax.annotate(percentage, (x, y), size=12)  # annotate the percentage
    plt.show()
### Function to plot stacked bar charts for categorical columns
def stacked_plot(feature_name, target_name, df):
    '''Stacked bar chart of target-class proportions within each level of a categorical feature.
    feature_name: name of the categorical column
    target_name: name of the target column
    df: dataframe containing both columns
    '''
    sns.set()
    ## crosstab with margins, kept for optional inspection via the commented prints
    # BUG FIX: was pd.crosstab(feature_name, ...) — passed the column *name* (a string)
    # instead of the column itself, which raises/misbehaves
    tab1 = pd.crosstab(df[feature_name], df[target_name], margins=True)
    #print(tab1)
    #print('-'*120)
    ## row-normalised crosstab so each bar stacks to 1 (proportions per feature level)
    tab = pd.crosstab(df[feature_name], df[target_name], normalize='index')
    tab.plot(kind='bar', stacked=True, figsize=(12, 5))
    # original called plt.legend twice; only the last call takes effect, so keep just that one
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
### Function to plot distributions and boxplots of a feature split by the target classes
def plot_by_target(feature_name, target_name, df):
    '''2x2 grid: distribution of the feature for each target class (top row) and
    boxplots of feature vs target, with and without outliers (bottom row).
    feature_name: numeric column to plot
    target_name: binary target column (0 = Existing Customer, 1 = Attrited Customer)
    df: dataframe containing both columns
    '''
    fig, axs = plt.subplots(2, 2, figsize=(12, 10))
    axs[0, 0].set_title('Existing Customer')
    # sns.histplot(kde=True) replaces the deprecated sns.distplot (removed in modern seaborn);
    # the file already uses histplot elsewhere
    sns.histplot(df[(df[target_name] == 0)][feature_name], ax=axs[0, 0], color='teal', kde=True, stat='density')
    axs[0, 1].set_title('Attrited Customer')
    sns.histplot(df[(df[target_name] == 1)][feature_name], ax=axs[0, 1], color='orange', kde=True, stat='density')
    axs[1, 0].set_title('Boxplot w.r.t target')
    # keyword args (x=, y=) are required by seaborn >= 0.12; positional data args were removed
    sns.boxplot(x=df[target_name], y=df[feature_name], ax=axs[1, 0], palette='gist_rainbow')
    axs[1, 1].set_title('Boxplot w.r.t target - Without outliers')
    sns.boxplot(x=df[target_name], y=df[feature_name], ax=axs[1, 1], showfliers=False, palette='gist_rainbow')
    plt.tight_layout()
    plt.show()
### Function to plot confusion matrix with descriptive labels
def make_confusion_matrix(model, y_actual, labels=(1, 0), X=None):
    '''
    Plot a heatmap confusion matrix annotated with counts and overall percentages.
    model : fitted classifier used to predict
    y_actual : ground-truth labels
    labels : kept for backward compatibility — the original never used it (it was a
             mutable list default that was immediately shadowed by a local)
    X : feature matrix to predict on; defaults to the notebook-level X_test so
        existing call sites keep working
    '''
    if X is None:
        X = X_test  # backward-compatible fallback to the global test split
    y_predict = model.predict(X)
    cm = metrics.confusion_matrix(y_actual, y_predict, labels=[0, 1])
    df_cm = pd.DataFrame(cm, index=["Actual - No", "Actual - Yes"],
                         columns=['Predicted - No', 'Predicted - Yes'])
    # annotate each cell with "count\npercentage of all samples"
    group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cm.flatten() / np.sum(cm)]
    cell_labels = np.asarray([f"{v1}\n{v2}" for v1, v2 in
                              zip(group_counts, group_percentages)]).reshape(2, 2)
    plt.figure(figsize=(10, 7))
    sns.heatmap(df_cm, annot=cell_labels, fmt='', cmap='coolwarm')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
### Function to create a summary df with scores on various metrics for each model it is called on
def _classification_scores(y_true, y_pred):
    '''Return a dict of accuracy/recall/precision/specificity/f1/FPR for one set of predictions.'''
    TN, FP, FN, TP = metrics.confusion_matrix(y_true, y_pred).ravel()
    TNR = TN / (TN + FP)  # specificity (true-negative rate)
    return {'model_accuracy': metrics.accuracy_score(y_true, y_pred),
            'model_recall_sensitivity_TPR': metrics.recall_score(y_true, y_pred),
            'model_precision': metrics.precision_score(y_true, y_pred),
            'Specificity_TNR': TNR,
            'model_f1': metrics.f1_score(y_true, y_pred),
            'FPR': 1 - TNR}

def model_score_df(model, X_train, y_train, X_test, y_test, label='model', score_df=None):
    '''
    Build (or extend) a dataframe of train/test classification metrics for a model.
    model : fitted classifier to predict values of X
    X_train: training data
    y_train: target values in training data
    X_test: testing data
    y_test: target values in testing data
    label: string identifier for the model (used in df column names)
    score_df: if None, a new dataframe is created; otherwise the two new columns
              are merged onto it
    returns:
    score_df: dataframe with '<label>-train' and '<label>-test' score columns appended
    '''
    # the original duplicated the whole metric block for train and test;
    # both now go through _classification_scores
    # Scores on Test Data
    df1 = pd.DataFrame(_classification_scores(y_test, model.predict(X_test)), index=[label + '-test']).T
    # Scores on Train Data
    df2 = pd.DataFrame(_classification_scores(y_train, model.predict(X_train)), index=[label + '-train']).T
    df_2_1 = pd.merge(df2, df1, left_index=True, right_index=True)  # train column first, then test
    if score_df is None:
        score_df = df_2_1
    else:
        score_df = pd.merge(score_df, df_2_1, left_index=True, right_index=True)
    return score_df
#Function to run kfolds and return scores as df
def kfolds(model, X_train, y_train, label, scoring='recall', results_df=None):
    '''Build (or extend) a dataframe of stratified k-fold CV scores.

    One column per model: the column is named `label` and its rows are the
    five fold scores. If `results_df` is given the new column is merged onto
    it, otherwise a fresh dataframe is returned.
    '''
    print(model)
    # five stratified, shuffled folds using the notebook-wide random state
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    fold_scores = cross_val_score(estimator=model, X=X_train, y=y_train,
                                  scoring=scoring, cv=splitter)
    new_col = pd.DataFrame(fold_scores, columns=[label])
    if results_df is None:
        return new_col
    return pd.merge(results_df, new_col, left_index=True, right_index=True)
#Function to return cv scores from cv object as df
def make_cv_scores_df(cv_obj, label, df=None):
    '''Collect mean test scores from a fitted CV search object into a dataframe.

    Each call contributes one column named `label`, holding
    cv_obj.cv_results_['mean_test_score']. Pass an existing `df` to append
    the column to it, or omit it to start a new dataframe.
    '''
    score_col = pd.DataFrame(cv_obj.cv_results_['mean_test_score'], columns=[label])
    if df is None:
        return score_col
    return pd.merge(df, score_col, left_index=True, right_index=True)
#Function to plot cv scores from df returned by make_cv_scores_df
def plot_cv_scores(kfold_results_df, title='Comparing Recall Scores'):
    '''Boxplot the score columns of kfold_results_df, one box per model column.'''
    plt.figure(figsize=(15, 10))
    plt.suptitle(title)
    # melt to long form: 'variable' = model name, 'value' = score
    long_form = kfold_results_df.melt()
    sns.boxplot(data=long_form, y='value', x='variable')
#Load data from file, inspect size and head
data = pd.read_csv('BankChurners.csv')
display(data.head())
display(data.shape)
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Unnamed: 21 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 | NaN |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 | NaN |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 | NaN |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | Unknown | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 | NaN |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 | NaN |
(10127, 22)
# CLIENTNUM seems like an ID type variable - check no of unique values
data['CLIENTNUM'].nunique()
10127
#Check each column for statistical spread etc
data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CLIENTNUM | 10127.0 | 7.391776e+08 | 3.690378e+07 | 708082083.0 | 7.130368e+08 | 7.179264e+08 | 7.731435e+08 | 8.283431e+08 |
| Customer_Age | 10127.0 | 4.632596e+01 | 8.016814e+00 | 26.0 | 4.100000e+01 | 4.600000e+01 | 5.200000e+01 | 7.300000e+01 |
| Dependent_count | 10127.0 | 2.346203e+00 | 1.298908e+00 | 0.0 | 1.000000e+00 | 2.000000e+00 | 3.000000e+00 | 5.000000e+00 |
| Months_on_book | 10127.0 | 3.592841e+01 | 7.986416e+00 | 13.0 | 3.100000e+01 | 3.600000e+01 | 4.000000e+01 | 5.600000e+01 |
| Total_Relationship_Count | 10127.0 | 3.812580e+00 | 1.554408e+00 | 1.0 | 3.000000e+00 | 4.000000e+00 | 5.000000e+00 | 6.000000e+00 |
| Months_Inactive_12_mon | 10127.0 | 2.341167e+00 | 1.010622e+00 | 0.0 | 2.000000e+00 | 2.000000e+00 | 3.000000e+00 | 6.000000e+00 |
| Contacts_Count_12_mon | 10127.0 | 2.455317e+00 | 1.106225e+00 | 0.0 | 2.000000e+00 | 2.000000e+00 | 3.000000e+00 | 6.000000e+00 |
| Credit_Limit | 10127.0 | 8.631954e+03 | 9.088777e+03 | 1438.3 | 2.555000e+03 | 4.549000e+03 | 1.106750e+04 | 3.451600e+04 |
| Total_Revolving_Bal | 10127.0 | 1.162814e+03 | 8.149873e+02 | 0.0 | 3.590000e+02 | 1.276000e+03 | 1.784000e+03 | 2.517000e+03 |
| Avg_Open_To_Buy | 10127.0 | 7.469140e+03 | 9.090685e+03 | 3.0 | 1.324500e+03 | 3.474000e+03 | 9.859000e+03 | 3.451600e+04 |
| Total_Amt_Chng_Q4_Q1 | 10127.0 | 7.599407e-01 | 2.192068e-01 | 0.0 | 6.310000e-01 | 7.360000e-01 | 8.590000e-01 | 3.397000e+00 |
| Total_Trans_Amt | 10127.0 | 4.404086e+03 | 3.397129e+03 | 510.0 | 2.155500e+03 | 3.899000e+03 | 4.741000e+03 | 1.848400e+04 |
| Total_Trans_Ct | 10127.0 | 6.485869e+01 | 2.347257e+01 | 10.0 | 4.500000e+01 | 6.700000e+01 | 8.100000e+01 | 1.390000e+02 |
| Total_Ct_Chng_Q4_Q1 | 10127.0 | 7.122224e-01 | 2.380861e-01 | 0.0 | 5.820000e-01 | 7.020000e-01 | 8.180000e-01 | 3.714000e+00 |
| Avg_Utilization_Ratio | 10127.0 | 2.748936e-01 | 2.756915e-01 | 0.0 | 2.300000e-02 | 1.760000e-01 | 5.030000e-01 | 9.990000e-01 |
| Unnamed: 21 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
#Dropping clientnum and last incorrect column
data = data.iloc[:,1:-1]
print(f'Data shape after dropping clientnum and last blank col is {data.shape[0]} rows and {data.shape[1]} columns')
Data shape after dropping clientnum and last blank col is 10127 rows and 20 columns
data.isnull().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
#Making a summary dataframe to use as a quick reference for column details - nulls, uniques, dtype and whether or
# not this is categorical (using <= 20 uniques as a threshold)
summary_df, sd = make_summary_cols(data)
summary_df
| n_uniques | col_types | nulls | isnumeric_column | probably_categorical | |
|---|---|---|---|---|---|
| Total_Relationship_Count | 6 | int64 | 0 | True | True |
| Customer_Age | 45 | int64 | 0 | True | False |
| Total_Trans_Ct | 126 | int64 | 0 | True | False |
| Dependent_count | 6 | int64 | 0 | True | True |
| Total_Trans_Amt | 5033 | int64 | 0 | True | False |
| Total_Revolving_Bal | 1974 | int64 | 0 | True | False |
| Months_on_book | 44 | int64 | 0 | True | False |
| Months_Inactive_12_mon | 7 | int64 | 0 | True | True |
| Contacts_Count_12_mon | 7 | int64 | 0 | True | True |
| Total_Amt_Chng_Q4_Q1 | 1158 | float64 | 0 | True | False |
| Avg_Open_To_Buy | 6813 | float64 | 0 | True | False |
| Credit_Limit | 6205 | float64 | 0 | True | False |
| Avg_Utilization_Ratio | 964 | float64 | 0 | True | False |
| Total_Ct_Chng_Q4_Q1 | 830 | float64 | 0 | True | False |
| Card_Category | 4 | object | 0 | False | True |
| Income_Category | 6 | object | 0 | False | True |
| Marital_Status | 4 | object | 0 | False | True |
| Education_Level | 7 | object | 0 | False | True |
| Gender | 2 | object | 0 | False | True |
| Attrition_Flag | 2 | object | 0 | False | True |
#Creating lists of numeric columns, non numeric, categorical and numeric non categorical columns
# to use later as convenience lists
# (derived from summary_df; numeric/categorical overlap is possible — e.g. low-cardinality ints)
numeric_cols = list(summary_df[summary_df.isnumeric_column==True].index)
non_numeric_cols = list(summary_df[summary_df.isnumeric_column==False].index)
categorical_cols = list(summary_df[summary_df.probably_categorical==True].index)
# numeric columns that are NOT low-cardinality, i.e. the continuous features
numeric_non_cat_cols = list(summary_df[(summary_df.isnumeric_column==True) & \
(summary_df.probably_categorical==False)].index)
print('numeric_cols are')
pprint(numeric_cols)
print('\nnon numeric cols are')
pprint(non_numeric_cols)
print('\ncategorical cols are')
pprint(categorical_cols)
print('\nnumeric non cat cols are')
pprint(numeric_non_cat_cols)
numeric_cols are ['Total_Relationship_Count', 'Customer_Age', 'Total_Trans_Ct', 'Dependent_count', 'Total_Trans_Amt', 'Total_Revolving_Bal', 'Months_on_book', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Total_Amt_Chng_Q4_Q1', 'Avg_Open_To_Buy', 'Credit_Limit', 'Avg_Utilization_Ratio', 'Total_Ct_Chng_Q4_Q1'] non numeric cols are ['Card_Category', 'Income_Category', 'Marital_Status', 'Education_Level', 'Gender', 'Attrition_Flag'] categorical cols are ['Total_Relationship_Count', 'Dependent_count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Card_Category', 'Income_Category', 'Marital_Status', 'Education_Level', 'Gender', 'Attrition_Flag'] numeric non cat cols are ['Customer_Age', 'Total_Trans_Ct', 'Total_Trans_Amt', 'Total_Revolving_Bal', 'Months_on_book', 'Total_Amt_Chng_Q4_Q1', 'Avg_Open_To_Buy', 'Credit_Limit', 'Avg_Utilization_Ratio', 'Total_Ct_Chng_Q4_Q1']
#Plotting feature as histogram and boxplot
#(median is blue line and mean is red dotted line on histogram)
histogram_boxplot(data['Customer_Age'])
data['Customer_Age'].describe()
count 10127.000000 mean 46.325960 std 8.016814 min 26.000000 25% 41.000000 50% 46.000000 75% 52.000000 max 73.000000 Name: Customer_Age, dtype: float64
#Checking value counts
data['Customer_Age'].value_counts().sort_index()
26 78 27 32 28 29 29 56 30 70 31 91 32 106 33 127 34 146 35 184 36 221 37 260 38 303 39 333 40 361 41 379 42 426 43 473 44 500 45 486 46 490 47 479 48 472 49 495 50 452 51 398 52 376 53 387 54 307 55 279 56 262 57 223 58 157 59 157 60 127 61 93 62 93 63 65 64 43 65 101 66 2 67 4 68 2 70 1 73 1 Name: Customer_Age, dtype: int64
print(f'Number of unique values = {data["Customer_Age"].nunique()}')
Number of unique values = 45
#Using plotly.express to show an interactive chart to quickly check values of outliers and other boxplot metrics interactively
#If not needed, please turn interactive_charts to False at start of notebook
# consistency fix: every later cell uses the idiomatic `if interactive_charts:` — `== True` removed
if interactive_charts:
    fig = px.box(data['Customer_Age'], orientation='h', height = 250)
    fig.show()
#Plotting feature as histogram and boxplot
# (median is blue line and mean is red dotted line on histogram)
histogram_boxplot(data['Total_Trans_Ct'])
data['Total_Trans_Ct'].describe()
count 10127.000000 mean 64.858695 std 23.472570 min 10.000000 25% 45.000000 50% 67.000000 75% 81.000000 max 139.000000 Name: Total_Trans_Ct, dtype: float64
#Checking value counts
data['Total_Trans_Ct'].value_counts().sort_index()
10 4
11 2
12 4
13 5
14 9
..
131 6
132 1
134 1
138 1
139 1
Name: Total_Trans_Ct, Length: 126, dtype: int64
print(f'Number of unique values = {data["Total_Trans_Ct"].nunique()}')
Number of unique values = 126
#checking last few values on upper end to examine outliers
# (we know outliers are on upper end from boxplot)
data['Total_Trans_Ct'].sort_values()[-40:]
9511 127 9600 127 9818 127 9300 127 9861 127 9106 127 9258 127 9550 127 9691 127 9813 128 10100 128 9317 128 9983 128 9614 128 9839 128 9646 128 9468 128 9560 128 9563 128 9575 129 9652 129 9595 129 9259 129 9510 129 9431 129 9643 130 9711 130 9960 130 9411 130 10060 130 9728 131 10085 131 9339 131 9841 131 9269 131 9261 131 9629 132 9213 134 9586 138 9324 139 Name: Total_Trans_Ct, dtype: int64
#Using plotly.express to show an interactive chart to quickly check values of outliers and other boxplot metrics interactively
#If not needed, please turn interactive_charts to False at start of notebook
if interactive_charts:
fig = px.box(data['Total_Trans_Ct'], orientation='h', height = 200)
fig.show()
#Plotting feature as histogram and boxplot (median is blue line and mean is red dotted line on histogram)
histogram_boxplot(data['Total_Trans_Amt'])
data['Total_Trans_Amt'].describe()
count 10127.000000 mean 4404.086304 std 3397.129254 min 510.000000 25% 2155.500000 50% 3899.000000 75% 4741.000000 max 18484.000000 Name: Total_Trans_Amt, dtype: float64
#Checking value counts
data['Total_Trans_Amt'].value_counts()
4509 11
4253 11
4518 10
2229 10
4042 9
..
804 1
2869 1
10468 1
15163 1
8192 1
Name: Total_Trans_Amt, Length: 5033, dtype: int64
#Using plotly.express to show an interactive chart to quickly check values of outliers and other boxplot metrics interactively
#If not needed, please turn interactive_charts to False at start of notebook
if interactive_charts:
fig = px.box(data['Total_Trans_Amt'], orientation='h', height = 200)
fig.show()
#Plotting feature as histogram and boxplot (median is blue line and mean is red dotted line on histogram)
histogram_boxplot(data['Total_Revolving_Bal'])
data['Total_Revolving_Bal'].describe()
count 10127.000000 mean 1162.814061 std 814.987335 min 0.000000 25% 359.000000 50% 1276.000000 75% 1784.000000 max 2517.000000 Name: Total_Revolving_Bal, dtype: float64
print(f'Number of uniques is: {data["Total_Revolving_Bal"].nunique()}')
Number of uniques is: 1974
data['Total_Revolving_Bal'].value_counts().sort_index()
0 2470
132 1
134 1
145 1
154 1
...
2511 1
2512 2
2513 1
2514 3
2517 508
Name: Total_Revolving_Bal, Length: 1974, dtype: int64
#Using plotly.express to show an interactive chart to quickly check values of outliers and other boxplot metrics interactively
#If not needed, please turn interactive_charts to False at start of notebook
if interactive_charts:
fig = px.box(data['Total_Revolving_Bal'], orientation='h', height = 200)
fig.show()
#Plotting feature as histogram and boxplot (median is blue line and mean is red dotted line on histogram)
histogram_boxplot(data['Months_on_book'])
data['Months_on_book'].describe()
count 10127.000000 mean 35.928409 std 7.986416 min 13.000000 25% 31.000000 50% 36.000000 75% 40.000000 max 56.000000 Name: Months_on_book, dtype: float64
print(f'Number of uniques is {data["Months_on_book"].nunique()}')
Number of uniques is 44
#Checking value counts
data['Months_on_book'].value_counts().sort_index()
13 70 14 16 15 34 16 29 17 39 18 58 19 63 20 74 21 83 22 105 23 116 24 160 25 165 26 186 27 206 28 275 29 241 30 300 31 318 32 289 33 305 34 353 35 317 36 2463 37 358 38 347 39 341 40 333 41 297 42 271 43 273 44 230 45 227 46 197 47 171 48 162 49 141 50 96 51 80 52 62 53 78 54 53 55 42 56 103 Name: Months_on_book, dtype: int64
#Using plotly.express to show an interactive chart to quickly check values of outliers and other boxplot metrics interactively
#If not needed, please turn interactive_charts to False at start of notebook
if interactive_charts:
fig = px.box(data['Months_on_book'], orientation='h', height = 200)
fig.show()
#Plotting feature as histogram and boxplot (median is blue line and mean is red dotted line on histogram)
histogram_boxplot(data['Total_Amt_Chng_Q4_Q1'])
data['Total_Amt_Chng_Q4_Q1'].describe()
count 10127.000000 mean 0.759941 std 0.219207 min 0.000000 25% 0.631000 50% 0.736000 75% 0.859000 max 3.397000 Name: Total_Amt_Chng_Q4_Q1, dtype: float64
print(f'Number of uniques is {data["Total_Amt_Chng_Q4_Q1"].nunique()}')
Number of uniques is 1158
#Checking value counts
data['Total_Amt_Chng_Q4_Q1'].value_counts().sort_index()
0.000 5
0.010 1
0.018 1
0.046 1
0.061 2
..
2.368 1
2.594 1
2.675 1
3.355 1
3.397 1
Name: Total_Amt_Chng_Q4_Q1, Length: 1158, dtype: int64
#Checking upper end outliers
data['Total_Amt_Chng_Q4_Q1'].sort_values()[-20:]
431 2.023 1873 2.037 1085 2.041 177 2.053 1219 2.103 154 2.121 284 2.145 4 2.175 841 2.180 7 2.204 466 2.271 58 2.275 658 2.282 46 2.316 47 2.357 219 2.368 2 2.594 773 2.675 8 3.355 12 3.397 Name: Total_Amt_Chng_Q4_Q1, dtype: float64
#Using plotly.express to show an interactive chart to quickly check values of outliers and other boxplot metrics interactively
#If not needed, please turn interactive_charts to False at start of notebook
if interactive_charts:
fig = px.box(data['Total_Amt_Chng_Q4_Q1'], orientation='h', height = 200)
fig.show()
# Capping age variable
data["Total_Amt_Chng_Q4_Q1"].clip(upper=2.368, inplace=True)
histogram_boxplot(data['Total_Amt_Chng_Q4_Q1'])
#Plotting feature as histogram and boxplot (median is blue line and mean is red dotted line on histogram)
histogram_boxplot(data['Avg_Open_To_Buy'])
data['Avg_Open_To_Buy'].describe()
count 10127.000000 mean 7469.139637 std 9090.685324 min 3.000000 25% 1324.500000 50% 3474.000000 75% 9859.000000 max 34516.000000 Name: Avg_Open_To_Buy, dtype: float64
print(f'Number of uniques is: {data["Avg_Open_To_Buy"].nunique()}')
Number of uniques is: 6813
#Checking value counts
data['Avg_Open_To_Buy'].value_counts().sort_index()
3.0 1
10.0 1
14.0 2
15.0 1
24.0 1
..
34297.0 1
34300.0 1
34302.0 1
34362.0 1
34516.0 98
Name: Avg_Open_To_Buy, Length: 6813, dtype: int64
#Using plotly.express to show an interactive chart to quickly check values of outliers and other boxplot metrics interactively
#If not needed, please turn interactive_charts to False at start of notebook
if interactive_charts:
fig = px.box(data['Avg_Open_To_Buy'], orientation='h', height = 200)
fig.show()
#Plotting feature as histogram and boxplot (median is blue line and mean is red dotted line on histogram)
histogram_boxplot(data['Credit_Limit'])
data['Credit_Limit'].describe()
count 10127.000000 mean 8631.953698 std 9088.776650 min 1438.300000 25% 2555.000000 50% 4549.000000 75% 11067.500000 max 34516.000000 Name: Credit_Limit, dtype: float64
print(f"Number of Uniques: {data['Credit_Limit'].nunique()}")
Number of Uniques: 6205
#Checking value counts
data['Credit_Limit'].value_counts().sort_index()
1438.3 507
1439.0 2
1440.0 1
1441.0 2
1442.0 1
...
34198.0 1
34427.0 1
34458.0 1
34496.0 1
34516.0 508
Name: Credit_Limit, Length: 6205, dtype: int64
#Using plotly.express to show an interactive chart to quickly check values of outliers and other boxplot metrics interactively
#If not needed, please turn interactive_charts to False at start of notebook
if interactive_charts:
fig = px.box(data['Credit_Limit'], orientation='h', height = 200)
fig.show()
#Plotting feature as histogram and boxplot (median is blue line and mean is red dotted line on histogram)
histogram_boxplot(data['Avg_Utilization_Ratio'])
data['Avg_Utilization_Ratio'].describe()
count 10127.000000 mean 0.274894 std 0.275691 min 0.000000 25% 0.023000 50% 0.176000 75% 0.503000 max 0.999000 Name: Avg_Utilization_Ratio, dtype: float64
print(f"Number of uniques: {data['Avg_Utilization_Ratio'].nunique()}")
Number of uniques: 964
#Checking value counts
data['Avg_Utilization_Ratio'].value_counts()
0.000 2470
0.073 44
0.057 33
0.048 32
0.060 30
...
0.335 1
0.985 1
0.949 1
0.818 1
0.972 1
Name: Avg_Utilization_Ratio, Length: 964, dtype: int64
#Using plotly.express to show an interactive chart to quickly check values of outliers and other boxplot metrics interactively
#If not needed, please turn interactive_charts to False at start of notebook
if interactive_charts:
fig = px.box(data['Avg_Utilization_Ratio'], orientation='h', height = 200)
fig.show()
#Plotting feature as histogram and boxplot (median is blue line and mean is red dotted line on histogram)
histogram_boxplot(data['Total_Ct_Chng_Q4_Q1'])
data['Total_Ct_Chng_Q4_Q1'].describe()
count 10127.000000 mean 0.712222 std 0.238086 min 0.000000 25% 0.582000 50% 0.702000 75% 0.818000 max 3.714000 Name: Total_Ct_Chng_Q4_Q1, dtype: float64
print(f"Number of uniques: {data['Total_Ct_Chng_Q4_Q1'].nunique()}")
Number of uniques: 830
#Checking value counts
data['Total_Ct_Chng_Q4_Q1'].value_counts()
0.667 171
1.000 166
0.500 161
0.750 156
0.600 113
...
0.859 1
2.083 1
0.473 1
1.075 1
1.074 1
Name: Total_Ct_Chng_Q4_Q1, Length: 830, dtype: int64
data['Total_Ct_Chng_Q4_Q1'].sort_values()[-40:]
76 1.875 323 1.875 2358 1.882 151 1.909 2696 1.923 52 1.923 300 2.000 13 2.000 1256 2.000 456 2.000 69 2.000 231 2.000 84 2.000 1455 2.000 294 2.083 309 2.100 162 2.167 91 2.182 131 2.200 1095 2.222 757 2.222 239 2.273 167 2.286 2 2.333 3 2.333 280 2.400 68 2.400 158 2.429 2510 2.500 805 2.500 4 2.500 30 2.571 366 2.750 146 2.875 113 3.000 190 3.000 12 3.250 269 3.500 773 3.571 1 3.714 Name: Total_Ct_Chng_Q4_Q1, dtype: float64
#Using plotly.express to show an interactive chart to quickly check values of outliers and other boxplot metrics interactively
#If not needed, please turn interactive_charts to False at start of notebook
if interactive_charts:
fig = px.box(data['Total_Ct_Chng_Q4_Q1'], orientation='h', height = 200)
fig.show()
data[data['Total_Ct_Chng_Q4_Q1']>1.923].shape
(34, 20)
#Clipping data to 1.923 as upper limit
data['Total_Ct_Chng_Q4_Q1'].clip(upper=1.923, inplace=True)
px.box(data['Total_Ct_Chng_Q4_Q1'], orientation='h', height = 200)
#Plotting feature as bars for each unique value with % of rows containing that value as labels
perc_on_bar('Total_Relationship_Count')
print(f"Number of uniques: {data['Total_Relationship_Count'].nunique()}")
Number of uniques: 6
data['Total_Relationship_Count'].value_counts()
3 2305 4 1912 5 1891 6 1866 2 1243 1 910 Name: Total_Relationship_Count, dtype: int64
#Plotting feature as bars for each unique value with % of rows containing that value as labels
perc_on_bar('Dependent_count')
#data['Dependent_count'].describe()
print(f"Number of uniques: {data['Dependent_count'].nunique()}")
Number of uniques: 6
data['Dependent_count'].value_counts()
3 2732 2 2655 1 1838 4 1574 0 904 5 424 Name: Dependent_count, dtype: int64
#Plotting feature as bars for each unique value with % of rows containing that value as labels
perc_on_bar('Months_Inactive_12_mon')
#data['Months_Inactive_12_mon'].describe()
print(f"Number of uniques: {data['Months_Inactive_12_mon'].nunique()}")
Number of uniques: 7
data['Months_Inactive_12_mon'].value_counts()
3 3846 2 3282 1 2233 4 435 5 178 6 124 0 29 Name: Months_Inactive_12_mon, dtype: int64
#Plotting feature as bars for each unique value with % of rows containing that value as labels
perc_on_bar('Contacts_Count_12_mon')
#data['Contacts_Count_12_mon'].describe()
print(f"Number of uniques: {data['Contacts_Count_12_mon'].nunique()}")
Number of uniques: 7
#Checking value counts
data['Contacts_Count_12_mon'].value_counts()
3 3380 2 3227 1 1499 4 1392 0 399 5 176 6 54 Name: Contacts_Count_12_mon, dtype: int64
#Plotting feature as bars for each unique value with % of rows containing that value as labels
perc_on_bar('Card_Category')
#data['Card_Category'].describe()
data['Card_Category'].value_counts()
Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64
#Plotting feature as bars for each unique value with % of rows containing that value as labels
perc_on_bar('Income_Category')
#data['Income_Category'].describe()
#Checking value counts
data['Income_Category'].value_counts()
Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 Unknown 1112 $120K + 727 Name: Income_Category, dtype: int64
#Plotting feature as bars for each unique value with % of rows containing that value as labels
perc_on_bar('Marital_Status')
#data['Marital_Status'].describe()
#Checking value counts
data['Marital_Status'].value_counts()
Married 4687 Single 3943 Unknown 749 Divorced 748 Name: Marital_Status, dtype: int64
#Plotting feature as bars for each unique value with % of rows containing that value as labels
perc_on_bar('Education_Level')
#data['Education_Level'].describe()
#Checking value counts
data['Education_Level'].value_counts()
Graduate 3128 High School 2013 Unknown 1519 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64
#Plotting feature as bars for each unique value with % of rows containing that value as labels
perc_on_bar('Gender')
#data['Gender'].describe()
#Checking value counts
data['Gender'].value_counts()
F 5358 M 4769 Name: Gender, dtype: int64
#Plotting feature as bars for each unique value with % of rows containing that value as labels
perc_on_bar('Attrition_Flag')
#data['Attrition_Flag'].describe()
#Checking value counts
data['Attrition_Flag'].value_counts()
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64
#to use in xgboost per https://xgboost.readthedocs.io/en/latest/parameter.html
# scale_pos_weight = (# negative examples) / (# positive examples); tells
# XGBoost to up-weight errors on the minority (attrited) class.
target_weights = data[data.Attrition_Flag=='Existing Customer'].shape[0]/data[data.Attrition_Flag=='Attrited Customer'].shape[0]
print(f'target weights param for xgboost (scale_pos_weight) is {target_weights}')
target weights param for xgboost (scale_pos_weight) is 5.224339274738783
#Creating a copy df to avoid messing up the data
working_df = data.copy()
## Encoding Categorical Ordinal Variables
# Label -> integer code maps. 'Unknown' is mapped to NaN (stored as pandas.NA
# under the nullable 'Int32' dtype) so it can be KNN-imputed later instead of
# being treated as a genuine category.
Card_Category_d = {'Blue':0, 'Silver':1, 'Gold':2, 'Platinum':3}
working_df['Card_Category']=working_df['Card_Category'].map(Card_Category_d).astype('Int32')
Income_Category_d = {'Less than $40K':0, '$40K - $60K':1, '$60K - $80K':2, '$80K - $120K':3, '$120K +': 4,
'Unknown': np.nan}
working_df['Income_Category']=working_df['Income_Category'].map(Income_Category_d).astype('Int32')
# Marital_Status is nominal rather than ordinal; it is integer-coded here only
# so the KNN imputer can run, and is decoded/one-hot encoded again later.
Marital_Status_d = {'Single':0, 'Married':1, 'Divorced':2, 'Unknown':np.nan}
working_df['Marital_Status']=working_df['Marital_Status'].map(Marital_Status_d).astype('Int32')
#Treating college as undergrad
Education_Level_d = {'Uneducated':0, 'High School':1, 'College':2, 'Graduate':3, 'Post-Graduate':4,
'Doctorate':5, 'Unknown': np.nan}
working_df['Education_Level']=working_df['Education_Level'].map(Education_Level_d).astype('Int32')
Gender_d = {'M':0, 'F':1}
working_df['Gender']=working_df['Gender'].map(Gender_d).astype('Int32')
# Target encoding: 1 = churned ('Attrited Customer'), 0 = retained
Attrition_Flag_d = {'Existing Customer':0, 'Attrited Customer':1}
working_df['Attrition_Flag']=working_df['Attrition_Flag'].map(Attrition_Flag_d).astype('Int32')
working_df
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45 | 0 | 3 | 1 | 1 | 2 | 0 | 39 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 0 | 49 | 1 | 5 | 3 | 0 | 0 | 0 | 44 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 1.923 | 0.105 |
| 2 | 0 | 51 | 0 | 3 | 3 | 1 | 3 | 0 | 36 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.368 | 1887 | 20 | 1.923 | 0.000 |
| 3 | 0 | 40 | 1 | 4 | 1 | <NA> | 0 | 0 | 34 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 1.923 | 0.760 |
| 4 | 0 | 40 | 0 | 3 | 0 | 1 | 2 | 0 | 21 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 1.923 | 0.000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10122 | 0 | 50 | 0 | 2 | 3 | 0 | 1 | 0 | 40 | 3 | 2 | 3 | 4003.0 | 1851 | 2152.0 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | 1 | 41 | 0 | 2 | <NA> | 2 | 1 | 0 | 25 | 4 | 2 | 3 | 4277.0 | 2186 | 2091.0 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | 1 | 44 | 1 | 1 | 1 | 1 | 0 | 0 | 36 | 5 | 3 | 4 | 5409.0 | 0 | 5409.0 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | 1 | 30 | 0 | 2 | 3 | <NA> | 1 | 0 | 36 | 4 | 3 | 3 | 5281.0 | 0 | 5281.0 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | 1 | 43 | 1 | 2 | 3 | 1 | 0 | 1 | 25 | 6 | 2 | 4 | 10388.0 | 1961 | 8427.0 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
10127 rows × 20 columns
working_df.dtypes
Attrition_Flag Int32 Customer_Age int64 Gender Int32 Dependent_count int64 Education_Level Int32 Marital_Status Int32 Income_Category Int32 Card_Category Int32 Months_on_book int64 Total_Relationship_Count int64 Months_Inactive_12_mon int64 Contacts_Count_12_mon int64 Credit_Limit float64 Total_Revolving_Bal int64 Avg_Open_To_Buy float64 Total_Amt_Chng_Q4_Q1 float64 Total_Trans_Amt int64 Total_Trans_Ct int64 Total_Ct_Chng_Q4_Q1 float64 Avg_Utilization_Ratio float64 dtype: object
#Checking nan values
working_df.isna().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 1112 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
%%time
plt.figure(figsize=(15,15))
sns.heatmap(working_df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
CPU times: user 480 ms, sys: 64.8 ms, total: 544 ms Wall time: 315 ms
<AxesSubplot:>
%%time
sns.pairplot(data, hue='Attrition_Flag', corner=True)
CPU times: user 35.9 s, sys: 496 ms, total: 36.4 s Wall time: 35.8 s
<seaborn.axisgrid.PairGrid at 0x7ff1502f8700>
#Grouping by target and examining mean and median for the numeric, non categorical columns to establish if any pattern
#against target
# numeric_non_cat_cols is defined earlier in the notebook (presumably from the
# make_summary_cols helper — confirm against the earlier cells)
means_by_target = working_df.loc[:,numeric_non_cat_cols+['Attrition_Flag']].groupby('Attrition_Flag').mean()
median_by_target = working_df.loc[:,numeric_non_cat_cols+['Attrition_Flag']].groupby('Attrition_Flag').median()
print("mean vs. target:")
display(means_by_target)
print("median vs. target:")
display(median_by_target)
mean vs. target:
| Customer_Age | Total_Trans_Ct | Total_Trans_Amt | Total_Revolving_Bal | Months_on_book | Total_Amt_Chng_Q4_Q1 | Avg_Open_To_Buy | Credit_Limit | Avg_Utilization_Ratio | Total_Ct_Chng_Q4_Q1 | |
|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | ||||||||||
| 0 | 46.262118 | 68.672588 | 4654.655882 | 1256.604118 | 35.880588 | 0.772210 | 7470.273400 | 8726.877518 | 0.296412 | 0.740422 |
| 1 | 46.659496 | 44.933620 | 3095.025814 | 672.822987 | 36.178242 | 0.694277 | 7463.216472 | 8136.039459 | 0.162475 | 0.553848 |
median vs. target:
| Customer_Age | Total_Trans_Ct | Total_Trans_Amt | Total_Revolving_Bal | Months_on_book | Total_Amt_Chng_Q4_Q1 | Avg_Open_To_Buy | Credit_Limit | Avg_Utilization_Ratio | Total_Ct_Chng_Q4_Q1 | |
|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | ||||||||||
| 0 | 46 | 71 | 4100 | 1364 | 36 | 0.743 | 3469.5 | 4643.5 | 0.211 | 0.721 |
| 1 | 47 | 43 | 2329 | 0 | 36 | 0.701 | 3488.0 | 4178.0 | 0.000 | 0.531 |
def show_values_on_bars(axs):
    """Write each bar's height (formatted to 2 d.p.) centred above the bar.

    axs: a single Axes, or a numpy array of Axes as returned by plt.subplots.
    """
    def _annotate(ax):
        for bar in ax.patches:
            centre = bar.get_x() + bar.get_width() / 2
            top = bar.get_y() + bar.get_height()
            label = '{:.2f}'.format(bar.get_height())
            ax.text(centre, top, label, ha="center")
    if isinstance(axs, np.ndarray):
        # .flat walks the array in C order, same as np.ndenumerate
        for ax in axs.flat:
            _annotate(ax)
    else:
        _annotate(axs)
#source: https://stackoverflow.com/a/51535326
# One bar chart per numeric feature: class means (index 0/1) split by target.
# NOTE(review): ax[i%5, i%2] visits every cell of the 5x2 grid exactly once
# for i in 0..9 (5 and 2 are coprime), but in a scrambled order; ax[i//2, i%2]
# would fill the grid row-by-row instead.
fig, ax = plt.subplots(5,2,figsize=(20,15), sharex=True)
for i,col in enumerate(means_by_target.columns):
    zz=sns.barplot(x=means_by_target.index, y=means_by_target[col], ax=ax[i%5,i%2])
    show_values_on_bars(zz)
Against the target the following features have clear patterns:
Total_Trans_Amt: mean of 4655 for existing vs 3095 for attrited customers
Total_Trans_Ct: mean of 68.7 vs 44.9 for existing vs attrited customers
Avg_Utilization_Ratio: mean of 0.30 vs 0.16 for existing vs attrited customer
Total_Revolving_Balance: mean of 1257 vs 673 for existing vs attrited customers
Total_Ct_Chng_Q4_Q1: mean of 0.74 vs 0.55 for existing vs attrited customers
Total_Amt_Chng_Q4_Q1: mean of 0.77 vs 0.69 for existing vs attrited customers
Credit_Limit: mean of 8727 vs 8136 for existing vs attrited customers
In contrast, the remaining numeric features show little separation against the target; the categorical features are examined next via cross-tabs:
# Column-normalised cross-tabs: within each level of a categorical feature,
# the share of attrited vs existing customers (last entry of categorical_cols
# is excluded, matching the original).
target = data['Attrition_Flag']
for feature in categorical_cols[:-1]:
    display(pd.crosstab(target, data[feature], normalize='columns'))
| Total_Relationship_Count | 1 | 2 | 3 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|
| Attrition_Flag | ||||||
| Attrited Customer | 0.256044 | 0.278359 | 0.173536 | 0.117678 | 0.120042 | 0.105038 |
| Existing Customer | 0.743956 | 0.721641 | 0.826464 | 0.882322 | 0.879958 | 0.894962 |
| Dependent_count | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| Attrition_Flag | ||||||
| Attrited Customer | 0.149336 | 0.146355 | 0.157062 | 0.176428 | 0.165184 | 0.150943 |
| Existing Customer | 0.850664 | 0.853645 | 0.842938 | 0.823572 | 0.834816 | 0.849057 |
| Months_Inactive_12_mon | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|
| Attrition_Flag | |||||||
| Attrited Customer | 0.517241 | 0.044783 | 0.15387 | 0.214769 | 0.298851 | 0.179775 | 0.153226 |
| Existing Customer | 0.482759 | 0.955217 | 0.84613 | 0.785231 | 0.701149 | 0.820225 | 0.846774 |
| Contacts_Count_12_mon | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|
| Attrition_Flag | |||||||
| Attrited Customer | 0.017544 | 0.072048 | 0.124884 | 0.201479 | 0.226293 | 0.335227 | 1.0 |
| Existing Customer | 0.982456 | 0.927952 | 0.875116 | 0.798521 | 0.773707 | 0.664773 | 0.0 |
| Card_Category | Blue | Gold | Platinum | Silver |
|---|---|---|---|---|
| Attrition_Flag | ||||
| Attrited Customer | 0.160979 | 0.181034 | 0.25 | 0.147748 |
| Existing Customer | 0.839021 | 0.818966 | 0.75 | 0.852252 |
| Income_Category | $120K + | $40K - $60K | $60K - $80K | $80K - $120K | Less than $40K | Unknown |
|---|---|---|---|---|---|---|
| Attrition_Flag | ||||||
| Attrited Customer | 0.173315 | 0.151397 | 0.134807 | 0.157655 | 0.171862 | 0.168165 |
| Existing Customer | 0.826685 | 0.848603 | 0.865193 | 0.842345 | 0.828138 | 0.831835 |
| Marital_Status | Divorced | Married | Single | Unknown |
|---|---|---|---|---|
| Attrition_Flag | ||||
| Attrited Customer | 0.161765 | 0.151269 | 0.169414 | 0.17223 |
| Existing Customer | 0.838235 | 0.848731 | 0.830586 | 0.82777 |
| Education_Level | College | Doctorate | Graduate | High School | Post-Graduate | Uneducated | Unknown |
|---|---|---|---|---|---|---|---|
| Attrition_Flag | |||||||
| Attrited Customer | 0.152024 | 0.210643 | 0.155691 | 0.152012 | 0.178295 | 0.159381 | 0.168532 |
| Existing Customer | 0.847976 | 0.789357 | 0.844309 | 0.847988 | 0.821705 | 0.840619 | 0.831468 |
| Gender | F | M |
|---|---|---|
| Attrition_Flag | ||
| Attrited Customer | 0.173572 | 0.146152 |
| Existing Customer | 0.826428 | 0.853848 |
# Stacked bar of target share within each level of every categorical feature.
# stacked_plot is presumably a plotting helper defined earlier in the
# notebook — not visible here.
#fig, ax = plt.subplots(5,2,figsize=(20,15), sharex=True)
for i,col in enumerate(categorical_cols[:-1]):
    print(col)
    stacked_plot(col, 'Attrition_Flag', working_df)
    #sns.barplot(x=means_by_target.index, y=means_by_target[col], ax=ax[i%5,i%2])
Total_Relationship_Count
Dependent_count
Months_Inactive_12_mon
Contacts_Count_12_mon
Card_Category
Income_Category
Marital_Status
Education_Level
Gender
#Checking relationships of other features with Card Category
# groupby yields alphabetical order (Blue, Gold, Platinum, Silver); prefixing
# the labels with the ordinal rank makes sort_index() list Blue -> Platinum.
x = data.groupby('Card_Category').mean()
x.index = ['0Blue', '2Gold' , '3Platinum', '1Silver']
x.sort_index()
| Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0Blue | 46.372404 | 2.336477 | 35.960682 | 3.848665 | 2.345697 | 2.455596 | 7363.780002 | 1157.812209 | 6205.967794 | 0.759332 | 4225.406740 | 64.028190 | 0.710779 | 0.290859 |
| 1Silver | 45.679279 | 2.439640 | 35.452252 | 3.421622 | 2.273874 | 2.450450 | 25277.836036 | 1206.129730 | 24071.706306 | 0.762326 | 6590.482883 | 74.700901 | 0.707281 | 0.057310 |
| 2Gold | 45.439655 | 2.672414 | 35.525862 | 3.008621 | 2.310345 | 2.448276 | 28416.370690 | 1344.318966 | 27072.051724 | 0.772603 | 7685.612069 | 81.517241 | 0.706190 | 0.057103 |
| 3Platinum | 47.500000 | 2.450000 | 36.250000 | 2.300000 | 2.250000 | 2.500000 | 30283.450000 | 1267.950000 | 29015.500000 | 0.780100 | 8999.750000 | 86.950000 | 0.666200 | 0.043650 |
#Setting dtype as category for the categorical columns
for col in categorical_cols:
    working_df[col] = working_df[col].astype('category')
cols_to_drop = ['Avg_Open_To_Buy'] #dropping as discussed before - high correlation with other features
model_df = working_df.drop(cols_to_drop, axis =1)
#Preparing X and y for model
X = model_df.drop(columns='Attrition_Flag')
Y = model_df['Attrition_Flag'].astype('float64')
# Stratified 70/30 split keeps the ~16% churn rate identical in both halves
# (verified by the value_counts output below)
X_train_temp, X_test_temp, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=random_state, stratify=Y)
print(X_train_temp.shape, X_test_temp.shape)
(7088, 18) (3039, 18)
X_train_temp.head()
| Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7540 | 53 | 1 | 3 | 3 | 1 | 0 | 0 | 36 | 6 | 4 | 0 | 3319.0 | 913 | 0.612 | 4560 | 84 | 1.049 | 0.275 |
| 2920 | 52 | 0 | 2 | 2 | 1 | 3 | 0 | 36 | 6 | 3 | 4 | 2803.0 | 778 | 0.550 | 3142 | 64 | 0.561 | 0.278 |
| 6475 | 55 | 1 | 2 | NaN | 0 | 0 | 0 | 36 | 3 | 3 | 2 | 1886.0 | 1023 | 0.708 | 4233 | 81 | 0.884 | 0.542 |
| 1012 | 55 | 0 | 1 | 3 | 0 | 4 | 0 | 42 | 4 | 2 | 3 | 34516.0 | 1478 | 0.967 | 1308 | 41 | 0.414 | 0.043 |
| 6723 | 50 | 1 | 4 | 0 | 0 | 0 | 0 | 43 | 4 | 3 | 2 | 2835.0 | 2292 | 0.732 | 4686 | 79 | 0.756 | 0.808 |
display(y_test.value_counts(normalize=True))
display(y_train.value_counts(normalize=True))
0.0 0.839421 1.0 0.160579 Name: Attrition_Flag, dtype: float64
0.0 0.839306 1.0 0.160694 Name: Attrition_Flag, dtype: float64
# Impute the NaN ordinal codes ('Unknown' in Education/Marital/Income) from
# the 5 nearest neighbours; fitted on train only to avoid test-set leakage.
imputer = KNNImputer(n_neighbors=5)
#Fit and transform the train data
# NOTE(review): rebuilding the DataFrame resets the index to 0..n-1, so it no
# longer matches y_train's original index; sklearn fits align positionally,
# which keeps the later .fit(X_train, y_train) calls safe.
X_train_temp = pd.DataFrame(imputer.fit_transform(X_train_temp),columns=X_train_temp.columns )
#Transform the test data
X_test_temp=pd.DataFrame(imputer.transform(X_test_temp),columns=X_test_temp.columns )
#Checking that no column has missing values in train or test sets
print(X_train_temp.isna().sum())
print('-'*30)
print(X_test_temp.isna().sum())
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
## Function to inverse the encoding
def inverse_mapping(x,y, df_train, df_test):
    """Restore column y of both frames from numeric codes to string labels.

    x: the forward {label: code} dict used for encoding
    y: name of the column to decode
    df_train, df_test: modified in place; values are rounded to the nearest
    code, mapped back to labels, and the column becomes dtype 'category'
    """
    inv_dict = {code: label for label, code in x.items()}
    print(inv_dict)
    for frame in (df_train, df_test):
        frame[y] = np.round(frame[y]).map(inv_dict).astype('category')
#reverse encoding
# Decode the KNN-imputed numeric codes back to string labels (values are
# rounded to the nearest code inside inverse_mapping) so the nominal features
# can be one-hot encoded in the next step.
X_train = X_train_temp.copy()
X_test = X_test_temp.copy()
inverse_mapping(Gender_d, 'Gender', X_train, X_test)
inverse_mapping(Education_Level_d, 'Education_Level', X_train, X_test)
inverse_mapping(Income_Category_d, 'Income_Category', X_train, X_test)
inverse_mapping(Card_Category_d, 'Card_Category', X_train, X_test)
inverse_mapping(Marital_Status_d, 'Marital_Status', X_train, X_test)
{0: 'M', 1: 'F'}
{0: 'Uneducated', 1: 'High School', 2: 'College', 3: 'Graduate', 4: 'Post-Graduate', 5: 'Doctorate', nan: 'Unknown'}
{0: 'Less than $40K', 1: '$40K - $60K', 2: '$60K - $80K', 3: '$80K - $120K', 4: '$120K +', nan: 'Unknown'}
{0: 'Blue', 1: 'Silver', 2: 'Gold', 3: 'Platinum'}
{0: 'Single', 1: 'Married', 2: 'Divorced', nan: 'Unknown'}
#inspecting X_train
X_train
| Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 53.0 | F | 3.0 | Graduate | Married | Less than $40K | Blue | 36.0 | 6.0 | 4.0 | 0.0 | 3319.0 | 913.0 | 0.612 | 4560.0 | 84.0 | 1.049 | 0.275 |
| 1 | 52.0 | M | 2.0 | College | Married | $80K - $120K | Blue | 36.0 | 6.0 | 3.0 | 4.0 | 2803.0 | 778.0 | 0.550 | 3142.0 | 64.0 | 0.561 | 0.278 |
| 2 | 55.0 | F | 2.0 | High School | Single | Less than $40K | Blue | 36.0 | 3.0 | 3.0 | 2.0 | 1886.0 | 1023.0 | 0.708 | 4233.0 | 81.0 | 0.884 | 0.542 |
| 3 | 55.0 | M | 1.0 | Graduate | Single | $120K + | Blue | 42.0 | 4.0 | 2.0 | 3.0 | 34516.0 | 1478.0 | 0.967 | 1308.0 | 41.0 | 0.414 | 0.043 |
| 4 | 50.0 | F | 4.0 | Uneducated | Single | Less than $40K | Blue | 43.0 | 4.0 | 3.0 | 2.0 | 2835.0 | 2292.0 | 0.732 | 4686.0 | 79.0 | 0.756 | 0.808 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7083 | 41.0 | M | 2.0 | Uneducated | Married | $60K - $80K | Blue | 13.0 | 4.0 | 2.0 | 2.0 | 2243.0 | 1944.0 | 0.942 | 3216.0 | 77.0 | 0.674 | 0.867 |
| 7084 | 43.0 | F | 2.0 | College | Married | $40K - $60K | Blue | 36.0 | 1.0 | 2.0 | 6.0 | 3793.0 | 2517.0 | 0.512 | 2577.0 | 57.0 | 0.629 | 0.664 |
| 7085 | 48.0 | F | 5.0 | Post-Graduate | Married | $40K - $60K | Blue | 38.0 | 3.0 | 3.0 | 1.0 | 2926.0 | 716.0 | 0.949 | 4219.0 | 78.0 | 0.592 | 0.245 |
| 7086 | 40.0 | M | 4.0 | High School | Married | $80K - $120K | Blue | 36.0 | 6.0 | 1.0 | 0.0 | 2126.0 | 1321.0 | 0.596 | 2992.0 | 72.0 | 1.000 | 0.621 |
| 7087 | 49.0 | M | 2.0 | High School | Married | $60K - $80K | Blue | 38.0 | 5.0 | 2.0 | 1.0 | 5536.0 | 756.0 | 0.635 | 3918.0 | 70.0 | 0.628 | 0.137 |
7088 rows × 18 columns
#Performing one hot encoding
X_train=pd.get_dummies(X_train,drop_first=True)
X_test=pd.get_dummies(X_test,drop_first=True)
# Align test columns to the train columns: get_dummies is applied to each
# split independently, so a category level absent from one split would
# otherwise produce mismatched feature columns and break prediction.
# reindex adds any missing dummy columns as 0 and fixes the column order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
#print(X_train.shape, X_test.shape)
print(f'Training data has {X_train.shape[0]} rows and {X_train.shape[1]} features')
print(f'Testing data has {X_test.shape[0]} rows and {X_test.shape[1]} features')
Training data has 7088 rows and 28 features Testing data has 3039 rows and 28 features
#Inspecting after one hot encoding
X_train
| Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Gender_M | Education_Level_Doctorate | Education_Level_Graduate | Education_Level_High School | Education_Level_Post-Graduate | Education_Level_Uneducated | Marital_Status_Married | Marital_Status_Single | Income_Category_$40K - $60K | Income_Category_$60K - $80K | Income_Category_$80K - $120K | Income_Category_Less than $40K | Card_Category_Gold | Card_Category_Platinum | Card_Category_Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 53.0 | 3.0 | 36.0 | 6.0 | 4.0 | 0.0 | 3319.0 | 913.0 | 0.612 | 4560.0 | 84.0 | 1.049 | 0.275 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 52.0 | 2.0 | 36.0 | 6.0 | 3.0 | 4.0 | 2803.0 | 778.0 | 0.550 | 3142.0 | 64.0 | 0.561 | 0.278 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 55.0 | 2.0 | 36.0 | 3.0 | 3.0 | 2.0 | 1886.0 | 1023.0 | 0.708 | 4233.0 | 81.0 | 0.884 | 0.542 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 55.0 | 1.0 | 42.0 | 4.0 | 2.0 | 3.0 | 34516.0 | 1478.0 | 0.967 | 1308.0 | 41.0 | 0.414 | 0.043 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 50.0 | 4.0 | 43.0 | 4.0 | 3.0 | 2.0 | 2835.0 | 2292.0 | 0.732 | 4686.0 | 79.0 | 0.756 | 0.808 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7083 | 41.0 | 2.0 | 13.0 | 4.0 | 2.0 | 2.0 | 2243.0 | 1944.0 | 0.942 | 3216.0 | 77.0 | 0.674 | 0.867 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 7084 | 43.0 | 2.0 | 36.0 | 1.0 | 2.0 | 6.0 | 3793.0 | 2517.0 | 0.512 | 2577.0 | 57.0 | 0.629 | 0.664 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7085 | 48.0 | 5.0 | 38.0 | 3.0 | 3.0 | 1.0 | 2926.0 | 716.0 | 0.949 | 4219.0 | 78.0 | 0.592 | 0.245 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7086 | 40.0 | 4.0 | 36.0 | 6.0 | 1.0 | 0.0 | 2126.0 | 1321.0 | 0.596 | 2992.0 | 72.0 | 1.000 | 0.621 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 7087 | 49.0 | 2.0 | 38.0 | 5.0 | 2.0 | 1.0 | 5536.0 | 756.0 | 0.635 | 3918.0 | 70.0 | 0.628 | 0.137 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
7088 rows × 28 columns
y_train
#Defining logistic model and fitting to train data
lr_max_iter = 5000 #setting max iterations for logistic models (reused by later LR variants)
lr = LogisticRegression(random_state=random_state, max_iter=lr_max_iter)
lr.fit(X_train, y_train)
LogisticRegression(max_iter=5000, random_state=314159)
#Preparing scores for both train and test data
# NOTE(review): the lr.score(...) result below is computed but never used;
# the score table comes from the model_score_df helper defined earlier.
lr.score(X_train, y_train)
score_df = model_score_df(lr,X_train, y_train, X_test, y_test, 'LR')
score_df
| LR-train | LR-test | |
|---|---|---|
| model_accuracy | 0.896868 | 0.891412 |
| model_recall_sensitivity_TPR | 0.529412 | 0.536885 |
| model_precision | 0.755639 | 0.715847 |
| Specificity_TNR | 0.967221 | 0.959232 |
| model_f1 | 0.622612 | 0.613583 |
| FPR | 0.032779 | 0.040768 |
#plotting confusion matrix
make_confusion_matrix(lr, y_test)
#Performing kfolds on model and storing the scores in a df
kfold_results_df = kfolds(lr, X_train, y_train, 'LR')
kfold_results_df
LogisticRegression(max_iter=5000, random_state=314159)
| LR | |
|---|---|
| 0 | 0.456140 |
| 1 | 0.557018 |
| 2 | 0.552632 |
| 3 | 0.497797 |
| 4 | 0.517544 |
#Plotting all the recall scores for each iteration of kfold
plot_cv_scores(kfold_results_df)
from imblearn.over_sampling import SMOTE
print("Before UpSampling, counts of target =1: {}".format(sum(y_train==1)))
print("Before UpSampling, counts of target =0: {} \n".format(sum(y_train==0)))
# sampling_strategy=1 -> oversample the minority (attrited) class to a 1:1
# ratio; synthetic rows are interpolated between 5 nearest minority
# neighbours. NOTE(review): interpolation over one-hot dummy columns can
# yield fractional dummy values — confirm this is acceptable downstream.
sm = SMOTE(sampling_strategy = 1 ,k_neighbors = 5, random_state=random_state) #Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of target =1: {}".format(sum(y_train_over==1)))
print("After UpSampling, counts of target =0: {} \n".format(sum(y_train_over==0)))
print('After UpSampling, the shape of train_X: {}'.format(X_train_over.shape))
print('After UpSampling, the shape of train_y: {} \n'.format(y_train_over.shape))
Before UpSampling, counts of target =1: 1139 Before UpSampling, counts of target =0: 5949 After UpSampling, counts of target =1: 5949 After UpSampling, counts of target =0: 5949 After UpSampling, the shape of train_X: (11898, 28) After UpSampling, the shape of train_y: (11898,)
#Defining the new model, fitting to train data and getting scores on train and test
#using more iterations as kfold reported warning
# Same LR as before but trained on the SMOTE-balanced data; scores are
# appended to the existing score_df for side-by-side comparison.
lr_over = LogisticRegression(random_state = random_state, max_iter=10000)
lr_over.fit(X_train_over,y_train_over)
score_df = model_score_df(lr_over, X_train_over, y_train_over, X_test, y_test, 'LR-over', score_df)
score_df
| LR-train | LR-test | LR-over-train | LR-over-test | |
|---|---|---|---|---|
| model_accuracy | 0.896868 | 0.891412 | 0.889561 | 0.858506 |
| model_recall_sensitivity_TPR | 0.529412 | 0.536885 | 0.881325 | 0.639344 |
| model_precision | 0.755639 | 0.715847 | 0.896086 | 0.551237 |
| Specificity_TNR | 0.967221 | 0.959232 | 0.897798 | 0.900431 |
| model_f1 | 0.622612 | 0.613583 | 0.888644 | 0.592030 |
| FPR | 0.032779 | 0.040768 | 0.102202 | 0.099569 |
#Plotting confusion matrix
make_confusion_matrix(lr_over, y_test)
#performing kfolds on storing scores in a df
kfold_results_df = kfolds(lr_over, X_train_over, y_train_over, 'LR-over', results_df=kfold_results_df)
kfold_results_df
LogisticRegression(max_iter=10000, random_state=314159)
| LR | LR-over | |
|---|---|---|
| 0 | 0.456140 | 0.880672 |
| 1 | 0.557018 | 0.888235 |
| 2 | 0.552632 | 0.879832 |
| 3 | 0.497797 | 0.900757 |
| 4 | 0.517544 | 0.875630 |
#plotting kfold results for all models
plot_cv_scores(kfold_results_df)
#define model with regularisation, fit to train and prepare scores on train and test
# L1 (lasso) penalty requires the 'saga' solver; it shrinks uninformative
# coefficients toward zero. NOTE(review): saga converges slowly on
# unscaled features — presumably why max_iter is 5000 here; confirm.
lr_over_reg = LogisticRegression(random_state = random_state, max_iter=lr_max_iter, solver='saga',penalty='l1')
# Training on the SMOTE-oversampled training set
lr_over_reg.fit(X_train_over,y_train_over)
score_df = model_score_df(lr_over_reg, X_train_over, y_train_over, X_test, y_test, 'LR-over-reg', score_df)
score_df
| LR-train | LR-test | LR-over-train | LR-over-test | LR-over-reg-train | LR-over-reg-test | |
|---|---|---|---|---|---|---|
| model_accuracy | 0.896868 | 0.891412 | 0.889561 | 0.858506 | 0.774668 | 0.781836 |
| model_recall_sensitivity_TPR | 0.529412 | 0.536885 | 0.881325 | 0.639344 | 0.761473 | 0.760246 |
| model_precision | 0.755639 | 0.715847 | 0.896086 | 0.551237 | 0.782113 | 0.404580 |
| Specificity_TNR | 0.967221 | 0.959232 | 0.897798 | 0.900431 | 0.787864 | 0.785966 |
| model_f1 | 0.622612 | 0.613583 | 0.888644 | 0.592030 | 0.771655 | 0.528114 |
| FPR | 0.032779 | 0.040768 | 0.102202 | 0.099569 | 0.212136 | 0.214034 |
#plot confusion matrix
make_confusion_matrix(lr_over_reg, y_test)
#Doing kfolds and storing results in a df
kfold_results_df = kfolds(lr_over_reg, X_train_over, y_train_over, 'LR-over-reg', results_df=kfold_results_df)
kfold_results_df
LogisticRegression(max_iter=5000, penalty='l1', random_state=314159,
solver='saga')
| LR | LR-over | LR-over-reg | |
|---|---|---|---|
| 0 | 0.456140 | 0.880672 | 0.747059 |
| 1 | 0.557018 | 0.888235 | 0.740336 |
| 2 | 0.552632 | 0.879832 | 0.771429 |
| 3 | 0.497797 | 0.900757 | 0.753574 |
| 4 | 0.517544 | 0.875630 | 0.774790 |
plot_cv_scores(kfold_results_df)
#perform undersampling on train
from imblearn.under_sampling import RandomUnderSampler
# Randomly drop majority-class rows until both classes match the minority
# count (1139 each, per the printout below) — cheaper than SMOTE but
# discards most of the majority-class training data.
rus = RandomUnderSampler(random_state = random_state)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label =1: {}".format(sum(y_train==1)))
print("Before Under Sampling, counts of label =0: {} \n".format(sum(y_train==0)))
print("After Under Sampling, counts of label =1: {}".format(sum(y_train_un==1)))
print("After Under Sampling, counts of label =0: {} \n".format(sum(y_train_un==0)))
print('After Under Sampling, the shape of train_X: {}'.format(X_train_un.shape))
print('After Under Sampling, the shape of train_y: {} \n'.format(y_train_un.shape))
Before Under Sampling, counts of label =1: 1139 Before Under Sampling, counts of label =0: 5949 After Under Sampling, counts of label =1: 1139 After Under Sampling, counts of label =0: 1139 After Under Sampling, the shape of train_X: (2278, 28) After Under Sampling, the shape of train_y: (2278,)
#define model, fit it on train and prepare scores on train and test
# LR trained on the undersampled (balanced) training set; scores appended
# to score_df for comparison with the earlier variants.
lr_under = LogisticRegression(random_state = random_state, max_iter=lr_max_iter)
lr_under.fit(X_train_un,y_train_un )
score_df = model_score_df(lr_under, X_train_un, y_train_un, X_test, y_test, 'LR-under', score_df)
score_df
| LR-train | LR-test | LR-over-train | LR-over-test | LR-over-reg-train | LR-over-reg-test | LR-under-train | LR-under-test | |
|---|---|---|---|---|---|---|---|---|
| model_accuracy | 0.896868 | 0.891412 | 0.889561 | 0.858506 | 0.774668 | 0.781836 | 0.829236 | 0.815729 |
| model_recall_sensitivity_TPR | 0.529412 | 0.536885 | 0.881325 | 0.639344 | 0.761473 | 0.760246 | 0.824407 | 0.836066 |
| model_precision | 0.755639 | 0.715847 | 0.896086 | 0.551237 | 0.782113 | 0.404580 | 0.832447 | 0.459459 |
| Specificity_TNR | 0.967221 | 0.959232 | 0.897798 | 0.900431 | 0.787864 | 0.785966 | 0.834065 | 0.811838 |
| model_f1 | 0.622612 | 0.613583 | 0.888644 | 0.592030 | 0.771655 | 0.528114 | 0.828408 | 0.593023 |
| FPR | 0.032779 | 0.040768 | 0.102202 | 0.099569 | 0.212136 | 0.214034 | 0.165935 | 0.188162 |
#plot confusion matrix
make_confusion_matrix(lr_under, y_test)
#Perform kfolds and store scores in a df
kfold_results_df = kfolds(lr_under, X_train_un, y_train_un, 'LR-under', results_df=kfold_results_df)
kfold_results_df
LogisticRegression(max_iter=5000, random_state=314159)
| LR | LR-over | LR-over-reg | LR-under | |
|---|---|---|---|---|
| 0 | 0.456140 | 0.880672 | 0.747059 | 0.771930 |
| 1 | 0.557018 | 0.888235 | 0.740336 | 0.828947 |
| 2 | 0.552632 | 0.879832 | 0.771429 | 0.850877 |
| 3 | 0.497797 | 0.900757 | 0.753574 | 0.845815 |
| 4 | 0.517544 | 0.875630 | 0.774790 | 0.807018 |
plot_cv_scores(kfold_results_df)
#creating a dictionary of pipelines with various models as below
# Every pipeline scales features first so one loop can fit/score all models
# uniformly. NOTE(review): StandardScaler only matters for
# LogisticRegression; it is redundant (though harmless) for the tree-based
# ensembles.
models = {
    "LR":
        Pipeline(
            steps=[
                ("scaler", StandardScaler()),
                ("lr", LogisticRegression(random_state=random_state)),
            ]),
    "DT":
        Pipeline(
            steps=[
                ("scaler", StandardScaler()),
                ("dt", DecisionTreeClassifier(random_state=random_state)),
            ]),
    "RF":
        Pipeline(
            steps=[
                ("scaler", StandardScaler()),
                ("rf", RandomForestClassifier(random_state=random_state)),
            ]),
    "BG":
        Pipeline(
            steps=[
                ("scaler", StandardScaler()),
                ("bg", BaggingClassifier(random_state=random_state)),
            ]),
    "GBM":
        Pipeline(
            steps=[
                ("scaler", StandardScaler()),
                ("gb", GradientBoostingClassifier(random_state=random_state)),
            ]),
    "ADB":
        Pipeline(
            steps=[
                ("scaler", StandardScaler()),
                ("ad", AdaBoostClassifier(random_state=random_state)),
            ]),
    "XGB":
        Pipeline(
            steps=[
                ("scaler", StandardScaler()),
                # explicit eval_metric + use_label_encoder=False silence
                # xgboost's deprecation warnings
                ("xg", XGBClassifier(random_state=random_state,eval_metric='logloss', use_label_encoder=False)),
            ])
}
%%time
#Running base models without kfold
# Fit every pipeline on the (unbalanced) training set; train/test scores
# accumulate column-by-column into one comparison frame.
score_df_step2 = pd.DataFrame()  # placeholder; replaced on the first iteration
for i,key in enumerate(models):
    model = models[key]
    model.fit(X_train, y_train)
    # first iteration starts a fresh score frame; later ones append to it
    if i == 0:
        score_df_step2 = model_score_df(model, X_train, y_train, X_test, y_test, label=key, score_df = None)
    else:
        score_df_step2 = model_score_df(model, X_train, y_train, X_test, y_test, label=key, score_df = score_df_step2)
score_df_step2
CPU times: user 8.83 s, sys: 306 ms, total: 9.14 s Wall time: 3.27 s
| LR-train | LR-test | DT-train | DT-test | RF-train | RF-test | BG-train | BG-test | GBM-train | GBM-test | ADB-train | ADB-test | XGB-train | XGB-test | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| model_accuracy | 0.904769 | 0.902270 | 1.0 | 0.932873 | 1.0 | 0.958210 | 0.996755 | 0.955577 | 0.975875 | 0.967753 | 0.963177 | 0.959526 | 1.0 | 0.969398 |
| model_recall_sensitivity_TPR | 0.589991 | 0.594262 | 1.0 | 0.795082 | 1.0 | 0.795082 | 0.981563 | 0.823770 | 0.886743 | 0.860656 | 0.863038 | 0.854508 | 1.0 | 0.875000 |
| model_precision | 0.763636 | 0.745501 | 1.0 | 0.788618 | 1.0 | 0.934940 | 0.998214 | 0.891353 | 0.960076 | 0.933333 | 0.903493 | 0.889126 | 1.0 | 0.930283 |
| Specificity_TNR | 0.965036 | 0.961192 | 1.0 | 0.959232 | 1.0 | 0.989416 | 0.999664 | 0.980792 | 0.992940 | 0.988240 | 0.982350 | 0.979616 | 1.0 | 0.987456 |
| model_f1 | 0.665676 | 0.661345 | 1.0 | 0.791837 | 1.0 | 0.859358 | 0.989819 | 0.856230 | 0.921953 | 0.895522 | 0.882802 | 0.871473 | 1.0 | 0.901795 |
| FPR | 0.034964 | 0.038808 | 0.0 | 0.040768 | 0.0 | 0.010584 | 0.000336 | 0.019208 | 0.007060 | 0.011760 | 0.017650 | 0.020384 | 0.0 | 0.012544 |
%%time
kfold_results_step2 = pd.DataFrame()
for i,key in enumerate(models):
model = models[key]
if i==0:
kfold_results_step2 = kfolds(model, X_train, y_train, key, results_df=None)
else:
kfold_results_step2 = kfolds(model, X_train, y_train, key, results_df=kfold_results_step2)
kfold_results_step2
Pipeline(steps=[('scaler', StandardScaler()),
('lr', LogisticRegression(random_state=314159))])
Pipeline(steps=[('scaler', StandardScaler()),
('dt', DecisionTreeClassifier(random_state=314159))])
Pipeline(steps=[('scaler', StandardScaler()),
('rf', RandomForestClassifier(random_state=314159))])
Pipeline(steps=[('scaler', StandardScaler()),
('bg', BaggingClassifier(random_state=314159))])
Pipeline(steps=[('scaler', StandardScaler()),
('gb', GradientBoostingClassifier(random_state=314159))])
Pipeline(steps=[('scaler', StandardScaler()),
('ad', AdaBoostClassifier(random_state=314159))])
Pipeline(steps=[('scaler', StandardScaler()),
('xg',
XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1, colsample_bynode=1,
colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0,
max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100,
n_jobs=16, num_parallel_tree=1,
random_state=314159, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1,
tree_method='exact', use_label_encoder=False,
validate_parameters=1, verbosity=None))])
CPU times: user 32.4 s, sys: 1.35 s, total: 33.7 s
Wall time: 11.7 s
| LR | DT | RF | BG | GBM | ADB | XGB | |
|---|---|---|---|---|---|---|---|
| 0 | 0.565789 | 0.824561 | 0.793860 | 0.833333 | 0.850877 | 0.846491 | 0.877193 |
| 1 | 0.600877 | 0.767544 | 0.820175 | 0.842105 | 0.833333 | 0.824561 | 0.885965 |
| 2 | 0.592105 | 0.789474 | 0.793860 | 0.798246 | 0.807018 | 0.789474 | 0.885965 |
| 3 | 0.568282 | 0.770925 | 0.775330 | 0.792952 | 0.850220 | 0.845815 | 0.863436 |
| 4 | 0.570175 | 0.767544 | 0.745614 | 0.776316 | 0.837719 | 0.833333 | 0.877193 |
# Box plots of the per-fold recall scores, one box per model.
plt.figure(figsize=(15, 10))
plt.suptitle('Comparing Recall Scores')
# Long-format frame: 'variable' holds the model label, 'value' the fold score
melted_scores = kfold_results_step2.melt()
sns.boxplot(data=melted_scores, x='variable', y='value')
<AxesSubplot:xlabel='variable', ylabel='value'>
# Defining parameters together for convenience
# For some of these we define separate parameters for grid search as the exhaustive random search
# params take too long to run in grid search
# Keys ending in "-grid" are the reduced spaces used by GridSearchCV; the
# plain keys are the fuller spaces used by RandomizedSearchCV. Parameter
# prefixes (lr__, dt__, ...) address the named steps of the `models` pipelines.
parameters = {
"LR": {
'lr__penalty': ['l1','l2'],
'lr__C': [0.1,0.2,0.5,0.7,1.0],
'lr__class_weight':['balanced',None],
# saga supports both l1 and l2 penalties
'lr__solver': ['saga'],
'lr__max_iter': [1000,5000,10000]
},
"DT": {
'dt__max_depth': np.arange(20,30),
'dt__min_samples_leaf': [1, 2, 5, 7, 10],
'dt__max_leaf_nodes' : [2, 3, 5, 10,15],
'dt__min_impurity_decrease': [0.00001,0.0001,0.001,0.01,0.1],
'dt__class_weight':['balanced', None]
},
"RF-grid": {
'rf__max_features': ['sqrt','log2','auto'],
'rf__min_samples_leaf': np.arange(1,15,5),
'rf__min_samples_split': np.arange(2, 20, 5),
'rf__n_estimators': [50,100,120,140,150],
'rf__class_weight':['balanced']
},
"RF": {
'rf__max_features': ['sqrt','log2','auto'],
'rf__min_samples_leaf': np.arange(1,15,5),
'rf__min_samples_split': np.arange(2, 20, 5),
'rf__n_estimators': list(np.linspace(50, 150, 10, dtype = int)),
'rf__class_weight':['balanced',None]
},
"BG": {
'bg__base_estimator': [
DecisionTreeClassifier(max_depth=1, random_state=random_state),
DecisionTreeClassifier(max_depth=2, random_state=random_state),
DecisionTreeClassifier(max_depth=3, random_state=random_state)],
'bg__n_estimators':[5,7,15,51,77,101],
'bg__max_features': [0.7,0.8,0.9,1]
},
"GBM-grid": {
'gb__n_estimators': [250,300],
'gb__subsample':[0.7,0.8,0.9,1],
'gb__max_features':[0.3,0.5,'auto'],
'gb__max_depth': [3,5,7]
},
"GBM": {
'gb__n_estimators': [150,200,250,300],
'gb__subsample':[0.7,0.8,0.9,1],
'gb__max_features':[0.3,0.5,'auto'],
'gb__max_depth': [3,5,7,10]
},
"ADB": {
'ad__n_estimators': np.arange(10, 110, 10),
'ad__learning_rate': [0.1, 0.01, 0.2, 0.05, 1],
'ad__base_estimator': [
DecisionTreeClassifier(max_depth=1, random_state=random_state),
DecisionTreeClassifier(max_depth=2, random_state=random_state),
DecisionTreeClassifier(max_depth=3, random_state=random_state)]
},
"XGB-grid": {
'xg__n_estimators': [50,100],
'xg__scale_pos_weight': [1,target_weights], #using target_weights as computed earlier and as per xgb tutorial linked below
'xg__learning_rate': [0.1,0.2],
'xg__gamma': [0,30,50],
'xg__subsample': [0.8,1],
'xg__colsample_bytree': [0.5,0.7],
'xg__max_depth': [1,3,6],
'xg__tree_method': ['hist'], #per advice in tutorial linked below
'xg__min_child_weight': [0,1],
#'xg__colsample_bylevel': [0.5,0.7,1]
},
"XGB": {
'xg__n_estimators': np.arange(10,150,10),
'xg__scale_pos_weight': [1,target_weights], #using target_weights as computed earlier and as per xgb tutorial linked below
'xg__learning_rate': [0.05,0.1,0.2,0.3],
'xg__gamma': np.arange(0,150,10),
'xg__subsample': [0.5,0.6,0.7,0.8,0.9,1],
'xg__colsample_bytree': [0.1,0.2,0.3,0.5,0.7,1],
'xg__max_depth': [3,5,6,8,10,12,15],
'xg__tree_method': ['hist'], #per advice in tutorial linked below
'xg__min_child_weight': [0,3,5,7,8,9,10],
'xg__colsample_bylevel': [0.5,0.7,1]
}
}
#using this for xgboost: https://xgboost.readthedocs.io/en/latest/tutorials/param_tuning.html
%%time
# Grid-search tune the LR pipeline on the original training data,
# optimizing recall.
if run_grid_search:
pipe = models['LR']
param_grid = parameters['LR']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)
lr_tuned_grid = grid_cv.best_estimator_
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
print(lr_tuned_grid)
Best Parameters:{'lr__C': 0.1, 'lr__class_weight': 'balanced', 'lr__max_iter': 1000, 'lr__penalty': 'l1', 'lr__solver': 'saga'}
Score: 0.8480987711569673
Pipeline(steps=[('scaler', StandardScaler()),
('lr',
LogisticRegression(C=0.1, class_weight='balanced',
max_iter=1000, penalty='l1',
random_state=314159, solver='saga'))])
CPU times: user 2min 43s, sys: 29.1 s, total: 3min 13s
Wall time: 24.6 s
%%time
# Randomized-search tune the LR pipeline on the original training data
# (recall scorer, 50 sampled candidates).
pipe = models['LR']
param_grid = parameters['LR']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
cv_obj = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=random_state)
#Fitting parameters in RandomizedSearchCV
cv_obj.fit(X_train,y_train)
lr_tuned_rand = cv_obj.best_estimator_
print("Best parameters are {} with CV score={}:" .format(cv_obj.best_params_, cv_obj.best_score_))
print(lr_tuned_rand)
Best parameters are {'lr__solver': 'saga', 'lr__penalty': 'l1', 'lr__max_iter': 10000, 'lr__class_weight': 'balanced', 'lr__C': 0.1} with CV score=0.8480987711569673:
Pipeline(steps=[('scaler', StandardScaler()),
('lr',
LogisticRegression(C=0.1, class_weight='balanced',
max_iter=10000, penalty='l1',
random_state=314159, solver='saga'))])
CPU times: user 2min 21s, sys: 27 s, total: 2min 48s
Wall time: 21.6 s
# Collect per-candidate CV scores from the randomized search into a frame
rand_cv_scores_df = make_cv_scores_df(cv_obj, label='LR')
rand_cv_scores_df.head()
| LR | |
|---|---|
| 0 | 0.563649 |
| 1 | 0.846344 |
| 2 | 0.846348 |
| 3 | 0.578565 |
| 4 | 0.569785 |
make_confusion_matrix(lr_tuned_rand, y_test)
%%time
# Grid-search tune the LR pipeline on the OVERsampled training data.
if run_grid_search:
pipe = models['LR']
param_grid = parameters['LR']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train_over, y_train_over)
lr_over_tuned_grid = grid_cv.best_estimator_
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
print(lr_over_tuned_grid)
Best Parameters:{'lr__C': 0.1, 'lr__class_weight': 'balanced', 'lr__max_iter': 1000, 'lr__penalty': 'l1', 'lr__solver': 'saga'}
Score: 0.8776416874571528
Pipeline(steps=[('scaler', StandardScaler()),
('lr',
LogisticRegression(C=0.1, class_weight='balanced',
max_iter=1000, penalty='l1',
random_state=314159, solver='saga'))])
CPU times: user 4min 3s, sys: 48.6 s, total: 4min 51s
Wall time: 54.9 s
%%time
# Randomized-search tune the LR pipeline on the OVERsampled training data.
pipe = models['LR']
param_grid = parameters['LR']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
cv_obj = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=random_state)
#Fitting parameters in RandomizedSearchCV
cv_obj.fit(X_train_over,y_train_over)
lr_over_tuned_rand = cv_obj.best_estimator_
print("Best parameters are {} with CV score={}:" .format(cv_obj.best_params_, cv_obj.best_score_))
print(lr_over_tuned_rand)
Best parameters are {'lr__solver': 'saga', 'lr__penalty': 'l1', 'lr__max_iter': 10000, 'lr__class_weight': None, 'lr__C': 0.1} with CV score=0.8776416874571528:
Pipeline(steps=[('scaler', StandardScaler()),
('lr',
LogisticRegression(C=0.1, max_iter=10000, penalty='l1',
random_state=314159, solver='saga'))])
CPU times: user 3min 21s, sys: 45.3 s, total: 4min 6s
Wall time: 50.3 s
# Append the oversampled-LR CV scores as a new 'LR-over' column
rand_cv_scores_df = make_cv_scores_df(cv_obj, label='LR-over', df=rand_cv_scores_df)
rand_cv_scores_df.head()
| LR | LR-over | |
|---|---|---|
| 0 | 0.563649 | 0.877642 |
| 1 | 0.846344 | 0.877137 |
| 2 | 0.846348 | 0.876801 |
| 3 | 0.578565 | 0.876801 |
| 4 | 0.569785 | 0.876465 |
make_confusion_matrix(lr_over_tuned_rand, y_test)
%%time
if run_grid_search:
pipe = models['LR']
param_grid = parameters['LR']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSeachCV
grid_cv.fit(X_train_un, y_train_un)
lr_under_tuned_grid = grid_cv.best_estimator_
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
print(lr_over_tuned_grid)
Best Parameters:{'lr__C': 0.1, 'lr__class_weight': 'balanced', 'lr__max_iter': 1000, 'lr__penalty': 'l1', 'lr__solver': 'saga'}
Score: 0.8542507148929592
Pipeline(steps=[('scaler', StandardScaler()),
('lr',
LogisticRegression(C=0.1, class_weight='balanced',
max_iter=1000, penalty='l1',
random_state=314159, solver='saga'))])
CPU times: user 2min 5s, sys: 23.5 s, total: 2min 29s
Wall time: 18.9 s
%%time
# Randomized-search tune the LR pipeline on the UNDERsampled training data.
pipe = models['LR']
param_grid = parameters['LR']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
cv_obj = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=random_state)
#Fitting parameters in RandomizedSearchCV
cv_obj.fit(X_train_un,y_train_un)
lr_under_tuned_rand = cv_obj.best_estimator_
print("Best parameters are {} with CV score={}:" .format(cv_obj.best_params_, cv_obj.best_score_))
print(lr_under_tuned_rand)
Best parameters are {'lr__solver': 'saga', 'lr__penalty': 'l1', 'lr__max_iter': 10000, 'lr__class_weight': None, 'lr__C': 0.1} with CV score=0.8542507148929592:
Pipeline(steps=[('scaler', StandardScaler()),
('lr',
LogisticRegression(C=0.1, max_iter=10000, penalty='l1',
random_state=314159, solver='saga'))])
CPU times: user 2min 1s, sys: 30.8 s, total: 2min 31s
Wall time: 19.6 s
# Append the undersampled-LR CV scores as a new 'LR-under' column
rand_cv_scores_df = make_cv_scores_df(cv_obj, label='LR-under', df=rand_cv_scores_df)
rand_cv_scores_df.head()
| LR | LR-over | LR-under | |
|---|---|---|---|
| 0 | 0.563649 | 0.877642 | 0.854251 |
| 1 | 0.846344 | 0.877137 | 0.848988 |
| 2 | 0.846348 | 0.876801 | 0.848114 |
| 3 | 0.578565 | 0.876801 | 0.849865 |
| 4 | 0.569785 | 0.876465 | 0.845483 |
# Confusion matrix for the undersampled tuned LR, then compare all LR variants
make_confusion_matrix(lr_under_tuned_rand, y_test)
plot_cv_scores(rand_cv_scores_df)
%%time
# Grid-search tune the decision-tree pipeline (recall scorer).
if run_grid_search:
pipe = models['DT']
param_grid = parameters['DT']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)
dt_tuned_grid = grid_cv.best_estimator_
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
print(dt_tuned_grid)
Best Parameters:{'dt__class_weight': 'balanced', 'dt__max_depth': 20, 'dt__max_leaf_nodes': 5, 'dt__min_impurity_decrease': 1e-05, 'dt__min_samples_leaf': 1}
Score: 0.9569518509931216
Pipeline(steps=[('scaler', StandardScaler()),
('dt',
DecisionTreeClassifier(class_weight='balanced', max_depth=20,
max_leaf_nodes=5,
min_impurity_decrease=1e-05,
random_state=314159))])
CPU times: user 4min, sys: 534 ms, total: 4min 1s
Wall time: 4min 1s
%%time
# Randomized-search tune the decision-tree pipeline.
pipe = models['DT']
param_grid = parameters['DT']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
cv_obj = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=random_state)
#Fitting parameters in RandomizedSearchCV
cv_obj.fit(X_train,y_train)
dt_tuned_rand = cv_obj.best_estimator_
print("Best parameters are {} with CV score={}:" .format(cv_obj.best_params_, cv_obj.best_score_))
print(dt_tuned_rand)
Best parameters are {'dt__min_samples_leaf': 7, 'dt__min_impurity_decrease': 0.001, 'dt__max_leaf_nodes': 5, 'dt__max_depth': 21, 'dt__class_weight': 'balanced'} with CV score=0.9569518509931216:
Pipeline(steps=[('scaler', StandardScaler()),
('dt',
DecisionTreeClassifier(class_weight='balanced', max_depth=21,
max_leaf_nodes=5,
min_impurity_decrease=0.001,
min_samples_leaf=7,
random_state=314159))])
CPU times: user 5 s, sys: 12.3 ms, total: 5.01 s
Wall time: 5.02 s
# Append the DT CV scores and re-plot the comparison
rand_cv_scores_df = make_cv_scores_df(cv_obj, label='DT', df=rand_cv_scores_df)
plot_cv_scores(rand_cv_scores_df)
%%time
# Grid-search tune the random-forest pipeline, using the reduced
# 'RF-grid' space (the full 'RF' space is reserved for randomized search).
if run_grid_search:
pipe = models['RF']
param_grid = parameters['RF-grid']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)
rf_tuned_grid = grid_cv.best_estimator_
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
print(rf_tuned_grid)
Best Parameters:{'rf__class_weight': 'balanced', 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 11, 'rf__min_samples_split': 2, 'rf__n_estimators': 150}
Score: 0.905170415024345
Pipeline(steps=[('scaler', StandardScaler()),
('rf',
RandomForestClassifier(class_weight='balanced',
max_features='sqrt',
min_samples_leaf=11, n_estimators=150,
random_state=314159))])
CPU times: user 7min 40s, sys: 1.46 s, total: 7min 41s
Wall time: 7min 42s
%%time
# Randomized-search tune the random-forest pipeline on the full 'RF' space.
pipe = models['RF']
param_grid = parameters['RF']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
cv_obj = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=random_state)
#Fitting parameters in RandomizedSearchCV
cv_obj.fit(X_train,y_train)
rf_tuned_rand = cv_obj.best_estimator_
print("Best parameters are {} with CV score={}:" .format(cv_obj.best_params_, cv_obj.best_score_))
print(rf_tuned_rand)
Best parameters are {'rf__n_estimators': 138, 'rf__min_samples_split': 17, 'rf__min_samples_leaf': 11, 'rf__max_features': 'sqrt', 'rf__class_weight': 'balanced'} with CV score=0.9051742793106113:
Pipeline(steps=[('scaler', StandardScaler()),
('rf',
RandomForestClassifier(class_weight='balanced',
max_features='sqrt',
min_samples_leaf=11,
min_samples_split=17, n_estimators=138,
random_state=314159))])
CPU times: user 1min 52s, sys: 255 ms, total: 1min 53s
Wall time: 1min 53s
# Append the RF CV scores and re-plot the comparison
rand_cv_scores_df = make_cv_scores_df(cv_obj, label='RF', df=rand_cv_scores_df)
plot_cv_scores(rand_cv_scores_df)
%%time
# Grid-search tune the bagging pipeline (same 'BG' space as random search).
if run_grid_search:
pipe = models['BG']
param_grid = parameters['BG']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)
bg_tuned_grid = grid_cv.best_estimator_
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
print(bg_tuned_grid)
Best Parameters:{'bg__base_estimator': DecisionTreeClassifier(max_depth=3, random_state=314159), 'bg__max_features': 0.9, 'bg__n_estimators': 15}
Score: 0.7128951232707319
Pipeline(steps=[('scaler', StandardScaler()),
('bg',
BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
random_state=314159),
max_features=0.9, n_estimators=15,
random_state=314159))])
CPU times: user 1min 1s, sys: 223 ms, total: 1min 2s
Wall time: 1min 2s
%%time
# Randomized-search tune the bagging pipeline.
pipe = models['BG']
param_grid = parameters['BG']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
cv_obj = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=random_state)
#Fitting parameters in RandomizedSearchCV
cv_obj.fit(X_train,y_train)
bg_tuned_rand = cv_obj.best_estimator_
print("Best parameters are {} with CV score={}:" .format(cv_obj.best_params_, cv_obj.best_score_))
print(bg_tuned_rand)
Best parameters are {'bg__n_estimators': 15, 'bg__max_features': 0.9, 'bg__base_estimator': DecisionTreeClassifier(max_depth=3, random_state=314159)} with CV score=0.7128951232707319:
Pipeline(steps=[('scaler', StandardScaler()),
('bg',
BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
random_state=314159),
max_features=0.9, n_estimators=15,
random_state=314159))])
CPU times: user 39.8 s, sys: 136 ms, total: 39.9 s
Wall time: 39.9 s
# Append the bagging CV scores and re-plot the comparison
rand_cv_scores_df = make_cv_scores_df(cv_obj, label='BG', df=rand_cv_scores_df)
plot_cv_scores(rand_cv_scores_df)
%%time
# Grid-search tune the AdaBoost pipeline.
if run_grid_search:
pipe = models['ADB']
param_grid = parameters['ADB']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)
ad_tuned_grid = grid_cv.best_estimator_
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
print(ad_tuned_grid)
Best Parameters:{'ad__base_estimator': DecisionTreeClassifier(max_depth=2, random_state=314159), 'ad__learning_rate': 1, 'ad__n_estimators': 50}
Score: 0.8893809413401345
Pipeline(steps=[('scaler', StandardScaler()),
('ad',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
random_state=314159),
learning_rate=1, random_state=314159))])
CPU times: user 5min 19s, sys: 352 ms, total: 5min 20s
Wall time: 5min 20s
%%time
# Randomized-search tune the AdaBoost pipeline.
pipe = models['ADB']
param_grid = parameters['ADB']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
cv_obj = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=random_state)
#Fitting parameters in RandomizedSearchCV
cv_obj.fit(X_train,y_train)
ad_tuned_rand = cv_obj.best_estimator_
print("Best parameters are {} with CV score={}:" .format(cv_obj.best_params_, cv_obj.best_score_))
print(ad_tuned_rand)
Best parameters are {'ad__n_estimators': 60, 'ad__learning_rate': 1, 'ad__base_estimator': DecisionTreeClassifier(max_depth=2, random_state=314159)} with CV score=0.8841139191591314:
Pipeline(steps=[('scaler', StandardScaler()),
('ad',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
random_state=314159),
learning_rate=1, n_estimators=60,
random_state=314159))])
CPU times: user 1min 50s, sys: 281 ms, total: 1min 50s
Wall time: 1min 51s
# Append the AdaBoost CV scores ('AD' label, shortened from the 'ADB'
# models key) and re-plot the comparison
rand_cv_scores_df = make_cv_scores_df(cv_obj, label='AD', df=rand_cv_scores_df)
plot_cv_scores(rand_cv_scores_df)
%%time
# Grid-search tune the gradient-boosting pipeline, using the reduced
# 'GBM-grid' space.
if run_grid_search:
pipe = models['GBM']
param_grid = parameters['GBM-grid']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)
gb_tuned_grid = grid_cv.best_estimator_
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
print(gb_tuned_grid)
Best Parameters:{'gb__max_depth': 3, 'gb__max_features': 0.5, 'gb__n_estimators': 300, 'gb__subsample': 0.8}
Score: 0.8919970631424375
Pipeline(steps=[('scaler', StandardScaler()),
('gb',
GradientBoostingClassifier(max_features=0.5, n_estimators=300,
random_state=314159,
subsample=0.8))])
CPU times: user 16min 15s, sys: 1.78 s, total: 16min 16s
Wall time: 16min 18s
%%time
# Randomized-search tune the gradient-boosting pipeline on the full 'GBM' space.
pipe = models['GBM']
param_grid = parameters['GBM']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
cv_obj = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=random_state)
#Fitting parameters in RandomizedSearchCV
cv_obj.fit(X_train,y_train)
gb_tuned_rand = cv_obj.best_estimator_
print("Best parameters are {} with CV score={}:" .format(cv_obj.best_params_, cv_obj.best_score_))
print(gb_tuned_rand)
Best parameters are {'gb__subsample': 0.9, 'gb__n_estimators': 300, 'gb__max_features': 'auto', 'gb__max_depth': 3} with CV score=0.8867223123889019:
Pipeline(steps=[('scaler', StandardScaler()),
('gb',
GradientBoostingClassifier(max_features='auto',
n_estimators=300,
random_state=314159,
subsample=0.9))])
CPU times: user 11min 37s, sys: 682 ms, total: 11min 37s
Wall time: 11min 38s
# Append the gradient-boosting CV scores ('GB' label) and re-plot
rand_cv_scores_df = make_cv_scores_df(cv_obj, label='GB', df=rand_cv_scores_df)
plot_cv_scores(rand_cv_scores_df)
%%time
# Grid-search tune the XGBoost pipeline, using the reduced 'XGB-grid' space.
if run_grid_search:
pipe = models['XGB']
param_grid = parameters['XGB-grid']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)
xg_tuned_grid = grid_cv.best_estimator_
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
print(xg_tuned_grid)
Best Parameters:{'xg__colsample_bytree': 0.7, 'xg__gamma': 30, 'xg__learning_rate': 0.2, 'xg__max_depth': 6, 'xg__min_child_weight': 0, 'xg__n_estimators': 50, 'xg__scale_pos_weight': 5.224339274738783, 'xg__subsample': 0.8, 'xg__tree_method': 'hist'}
Score: 0.9560785222969319
Pipeline(steps=[('scaler', StandardScaler()),
('xg',
XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1, colsample_bynode=1,
colsample_bytree=0.7, eval_metric='logloss',
gamma=30, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.2,
max_delta_step=0, max_depth=6,
min_child_weight=0, missing=nan,
monotone_constraints='()', n_estimators=50,
n_jobs=16, num_parallel_tree=1,
random_state=314159, reg_alpha=0, reg_lambda=1,
scale_pos_weight=5.224339274738783,
subsample=0.8, tree_method='hist',
use_label_encoder=False, validate_parameters=1,
verbosity=None))])
CPU times: user 1h 16min 19s, sys: 2min 14s, total: 1h 18min 33s
Wall time: 5min 7s
%%time
# Randomized-search tune the XGBoost pipeline on the full 'XGB' space.
# n_iter=100 (vs 50 elsewhere): a larger budget for this much bigger space.
pipe = models['XGB']
param_grid = parameters['XGB']
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
cv_obj = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=100, scoring=scorer, cv=5, random_state=random_state)
#Fitting parameters in RandomizedSearchCV
cv_obj.fit(X_train,y_train)
xg_tuned_rand = cv_obj.best_estimator_
print("Best parameters are {} with CV score={}:" .format(cv_obj.best_params_, cv_obj.best_score_))
print(xg_tuned_rand)
Best parameters are {'xg__tree_method': 'hist', 'xg__subsample': 0.9, 'xg__scale_pos_weight': 5.224339274738783, 'xg__n_estimators': 110, 'xg__min_child_weight': 5, 'xg__max_depth': 10, 'xg__learning_rate': 0.3, 'xg__gamma': 90, 'xg__colsample_bytree': 0.3, 'xg__colsample_bylevel': 0.7} with CV score=0.9516693716670531:
Pipeline(steps=[('scaler', StandardScaler()),
('xg',
XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=0.7, colsample_bynode=1,
colsample_bytree=0.3, eval_metric='logloss',
gamma=90, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.3,
max_delta_step=0, max_depth=10,
min_child_weight=5, missing=nan,
monotone_constraints='()', n_estimators=110,
n_jobs=16, num_parallel_tree=1,
random_state=314159, reg_alpha=0, reg_lambda=1,
scale_pos_weight=5.224339274738783,
subsample=0.9, tree_method='hist',
use_label_encoder=False, validate_parameters=1,
verbosity=None))])
CPU times: user 23min 36s, sys: 55.1 s, total: 24min 31s
Wall time: 1min 41s
# Append the XGBoost CV scores ('XG' label) and re-plot
rand_cv_scores_df = make_cv_scores_df(cv_obj, label='XG', df=rand_cv_scores_df)
plot_cv_scores(rand_cv_scores_df)
# Score every grid-search-tuned estimator on train and test sets.
# Fix: the LR-under key previously used an underscore ('LR-under_grid'),
# inconsistent with every other hyphenated label; the display is kept inside
# the flag guard so the cell is a no-op when run_grid_search is False.
if run_grid_search:
    grid_tuned_models = {
        'LR-grid': lr_tuned_grid,
        'LR-over-grid': lr_over_tuned_grid,
        'LR-under-grid': lr_under_tuned_grid,
        'RF-grid': rf_tuned_grid,
        'DT-grid': dt_tuned_grid,
        'BG-grid': bg_tuned_grid,
        'AD-grid': ad_tuned_grid,
        'GB-grid': gb_tuned_grid,
        'XG-grid': xg_tuned_grid,
    }
    # model_score_df starts a fresh frame when score_df is None
    grid_score_df = None
    for key, estimator in grid_tuned_models.items():
        grid_score_df = model_score_df(estimator, X_train, y_train, X_test, y_test,
                                       label=key, score_df=grid_score_df)
    display(grid_score_df)
| LR-grid-train | LR-grid-test | LR-over-grid-train | LR-over-grid-test | LR-under_grid-train | LR-under_grid-test | RF-grid-train | RF-grid-test | DT-grid-train | DT-grid-test | BG-grid-train | BG-grid-test | AD-grid-train | AD-grid-test | GB-grid-train | GB-grid-test | XG-grid-train | XG-grid-test | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| model_accuracy | 0.847207 | 0.845673 | 0.884029 | 0.886147 | 0.840576 | 0.841066 | 0.964306 | 0.943731 | 0.732506 | 0.729187 | 0.924097 | 0.913458 | 0.985045 | 0.967753 | 0.992381 | 0.971043 | 0.962331 | 0.941099 |
| model_recall_sensitivity_TPR | 0.854258 | 0.852459 | 0.691835 | 0.690574 | 0.856014 | 0.856557 | 0.980685 | 0.893443 | 0.964004 | 0.959016 | 0.727831 | 0.692623 | 0.947322 | 0.895492 | 0.964881 | 0.881148 | 0.981563 | 0.948770 |
| model_precision | 0.514815 | 0.511685 | 0.625894 | 0.633459 | 0.502318 | 0.503008 | 0.828635 | 0.785586 | 0.371825 | 0.368214 | 0.784295 | 0.749446 | 0.959111 | 0.902893 | 0.987421 | 0.934783 | 0.819648 | 0.750405 |
| Specificity_TNR | 0.845856 | 0.844375 | 0.920827 | 0.923559 | 0.837620 | 0.838103 | 0.961170 | 0.953352 | 0.688183 | 0.685221 | 0.961674 | 0.955704 | 0.992268 | 0.981576 | 0.997647 | 0.988240 | 0.958649 | 0.939632 |
| model_f1 | 0.642456 | 0.639508 | 0.657214 | 0.660784 | 0.633117 | 0.633813 | 0.898271 | 0.836050 | 0.536657 | 0.532121 | 0.755009 | 0.719915 | 0.953180 | 0.899177 | 0.976021 | 0.907173 | 0.893328 | 0.838009 |
| FPR | 0.154144 | 0.155625 | 0.079173 | 0.076441 | 0.162380 | 0.161897 | 0.038830 | 0.046648 | 0.311817 | 0.314779 | 0.038326 | 0.044296 | 0.007732 | 0.018424 | 0.002353 | 0.011760 | 0.041351 | 0.060368 |
# Score every randomized-search-tuned estimator on train and test sets;
# model_score_df starts a fresh frame when score_df is None.
rand_tuned_models = {'LR-rand':lr_tuned_rand, 'LR-over-rand':lr_over_tuned_rand, 'LR-under-rand':lr_under_tuned_rand, 'RF-rand':rf_tuned_rand, 'DT-rand':dt_tuned_rand, \
'BG-rand':bg_tuned_rand, 'AD-rand':ad_tuned_rand, 'GB-rand':gb_tuned_rand, 'XG-rand':xg_tuned_rand}
rand_score_df = None
for key in rand_tuned_models:
rand_score_df = model_score_df(rand_tuned_models[key], X_train, y_train, X_test, y_test, \
label=key, score_df=rand_score_df)
rand_score_df
| LR-rand-train | LR-rand-test | LR-over-rand-train | LR-over-rand-test | LR-under-rand-train | LR-under-rand-test | RF-rand-train | RF-rand-test | DT-rand-train | DT-rand-test | BG-rand-train | BG-rand-test | AD-rand-train | AD-rand-test | GB-rand-train | GB-rand-test | XG-rand-train | XG-rand-test | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| model_accuracy | 0.847207 | 0.845673 | 0.884029 | 0.886147 | 0.840576 | 0.841066 | 0.963742 | 0.944061 | 0.732506 | 0.729187 | 0.924097 | 0.913458 | 0.987302 | 0.968411 | 0.993369 | 0.972359 | 0.945119 | 0.940112 |
| model_recall_sensitivity_TPR | 0.854258 | 0.852459 | 0.691835 | 0.690574 | 0.856014 | 0.856557 | 0.978929 | 0.893443 | 0.964004 | 0.959016 | 0.727831 | 0.692623 | 0.951712 | 0.891393 | 0.971905 | 0.885246 | 0.973661 | 0.963115 |
| model_precision | 0.514815 | 0.511685 | 0.625894 | 0.633459 | 0.502318 | 0.503008 | 0.827151 | 0.787004 | 0.371825 | 0.368214 | 0.784295 | 0.749446 | 0.968722 | 0.910042 | 0.986631 | 0.939130 | 0.755450 | 0.741325 |
| Specificity_TNR | 0.845856 | 0.844375 | 0.920827 | 0.923559 | 0.837620 | 0.838103 | 0.960834 | 0.953744 | 0.688183 | 0.685221 | 0.961674 | 0.955704 | 0.994117 | 0.983144 | 0.997479 | 0.989024 | 0.939654 | 0.935711 |
| model_f1 | 0.642456 | 0.639508 | 0.657214 | 0.660784 | 0.633117 | 0.633813 | 0.896663 | 0.836852 | 0.536657 | 0.532121 | 0.755009 | 0.719915 | 0.960142 | 0.900621 | 0.979213 | 0.911392 | 0.850786 | 0.837790 |
| FPR | 0.154144 | 0.155625 | 0.079173 | 0.076441 | 0.162380 | 0.161897 | 0.039166 | 0.046256 | 0.311817 | 0.314779 | 0.038326 | 0.044296 | 0.005883 | 0.016856 | 0.002521 | 0.010976 | 0.060346 | 0.064289 |
XGBoost does well on accuracy and recall; its precision is suboptimal.
Business implications are discussed later.
# Train-vs-test RECALL per model: bars on the left axis, train-test gap
# (overfitting indicator) as a black line on the right axis.
rows = ['LR','LR-over','LR-under','RF', 'DT', 'BG', 'AD', 'GB', 'XG']
# Row 1 of rand_score_df is model_recall_sensitivity_TPR; odd column
# positions are the *-test columns, even positions the *-train columns.
dftest = pd.DataFrame(np.asarray(rand_score_df.iloc[1,np.arange(1,18,2)]), columns = ['Test'], index=rows)
dftrain = pd.DataFrame(np.asarray(rand_score_df.iloc[1,np.arange(0,17,2)]), columns = ['Train'], index=rows)
outdf = pd.merge(dftest, dftrain, left_index=True, right_index=True)
#outdf
import matplotlib
matplotlib.rc_file_defaults()
ax1 = sns.set_style(style=None, rc=None )
ax1 = outdf.plot(kind='bar', title='Recall (RandomSearchCV) - Scores (LHS); difference as line (RHS)')
z = outdf.Train - outdf.Test
ax2=ax1.twinx()
z.plot(kind='line', ax=ax2, color='black', marker='o');
# Precision comparison (RandomizedSearchCV-tuned models): train vs. test per model.
# rand_score_df row 2 is precision; even-numbered columns hold train scores and
# odd-numbered columns the matching test scores.
rows = ['LR', 'LR-over', 'LR-under', 'RF', 'DT', 'BG', 'AD', 'GB', 'XG']
dftest = pd.DataFrame(np.asarray(rand_score_df.iloc[2, np.arange(1, 18, 2)]),
                      columns=['Test'], index=rows)
dftrain = pd.DataFrame(np.asarray(rand_score_df.iloc[2, np.arange(0, 17, 2)]),
                       columns=['Train'], index=rows)
outdf = pd.merge(dftest, dftrain, left_index=True, right_index=True)
#outdf
matplotlib.rc_file_defaults()  # reset rcParams so seaborn styling doesn't leak in
# fix: sns.set_style returns None — call it for its side effect, don't assign to ax1
sns.set_style(style=None, rc=None)
ax1 = outdf.plot(kind='bar', title='Precision (RandomSearchCV) - Scores (LHS); difference as line (RHS)')
z = outdf.Train - outdf.Test  # train-minus-test gap: large values suggest overfitting
ax2 = ax1.twinx()  # secondary y-axis (RHS) for the gap line
z.plot(kind='line', ax=ax2, color='black', marker='o');
# F1 comparison (RandomizedSearchCV-tuned models): train vs. test per model.
# rand_score_df row 4 is F1; even-numbered columns hold train scores and
# odd-numbered columns the matching test scores.
rows = ['LR', 'LR-over', 'LR-under', 'RF', 'DT', 'BG', 'AD', 'GB', 'XG']
dftest = pd.DataFrame(np.asarray(rand_score_df.iloc[4, np.arange(1, 18, 2)]),
                      columns=['Test'], index=rows)
dftrain = pd.DataFrame(np.asarray(rand_score_df.iloc[4, np.arange(0, 17, 2)]),
                       columns=['Train'], index=rows)
outdf = pd.merge(dftest, dftrain, left_index=True, right_index=True)
#outdf
matplotlib.rc_file_defaults()  # reset rcParams so seaborn styling doesn't leak in
# fix: sns.set_style returns None — call it for its side effect, don't assign to ax1
sns.set_style(style=None, rc=None)
# fix: title had a doubled dash ("- -")
ax1 = outdf.plot(kind='bar', title='F1 (RandomSearchCV) - Scores (LHS); difference as line (RHS)')
z = outdf.Train - outdf.Test  # train-minus-test gap: large values suggest overfitting
ax2 = ax1.twinx()  # secondary y-axis (RHS) for the gap line
z.plot(kind='line', ax=ax2, color='black', marker='o');
rand_score_df.to_csv('out.csv')  # outputting scores to csv
# Horizontal bar chart of feature importances for the RandomizedSearchCV-tuned
# XGBoost model (element 1 of the tuple stored in rand_tuned_models).
key = 'XG-rand'
feature_names = X_train.columns
importances = rand_tuned_models[key][1].feature_importances_
order = np.argsort(importances)  # ascending, so the largest bar plots at the top
positions = range(len(order))
plt.figure(figsize=(5, 5))
plt.suptitle("Feature Importances-" + key)
plt.barh(positions, importances[order], color="violet", align="center")
plt.yticks(positions, [feature_names[j] for j in order])
plt.xlabel("Relative Importance")
plt.show()
# Horizontal bar chart of feature importances for the RandomizedSearchCV-tuned
# GradientBoosting model (element 1 of the tuple stored in rand_tuned_models).
key = 'GB-rand'
feature_names = X_train.columns
importances = rand_tuned_models[key][1].feature_importances_
order = np.argsort(importances)  # ascending, so the largest bar plots at the top
positions = range(len(order))
plt.figure(figsize=(5, 5))
plt.suptitle("Feature Importances-" + key)
plt.barh(positions, importances[order], color="violet", align="center")
plt.yticks(positions, [feature_names[j] for j in order])
plt.xlabel("Relative Importance")
plt.show()
# Horizontal bar chart of feature importances for the GridSearchCV-tuned
# XGBoost model (element 1 of the tuple stored in grid_tuned_models).
key = 'XG-grid'
feature_names = X_train.columns
importances = grid_tuned_models[key][1].feature_importances_
order = np.argsort(importances)  # ascending, so the largest bar plots at the top
positions = range(len(order))
plt.figure(figsize=(5, 5))
plt.suptitle("Feature Importances-" + key)
plt.barh(positions, importances[order], color="violet", align="center")
plt.yticks(positions, [feature_names[j] for j in order])
plt.xlabel("Relative Importance")
plt.show()
# Horizontal bar chart of feature importances for the GridSearchCV-tuned
# GradientBoosting model (element 1 of the tuple stored in grid_tuned_models).
key = 'GB-grid'
feature_names = X_train.columns
importances = grid_tuned_models[key][1].feature_importances_
order = np.argsort(importances)  # ascending, so the largest bar plots at the top
positions = range(len(order))
plt.figure(figsize=(5, 5))
plt.suptitle("Feature Importances-" + key)
plt.barh(positions, importances[order], color="violet", align="center")
plt.yticks(positions, [feature_names[j] for j in order])
plt.xlabel("Relative Importance")
plt.show()